v1.6.11-rc0 (#345)

Co-authored-by: rtosholdings-bot <[email protected]>
rtosholdings · Apr 18, 2023 · 1a06bcb · 1a06bcb
1 parent b7715da
commit 1a06bcb
Show file tree

Hide file tree

Showing 6 changed files with 538 additions and 20 deletions.
diff --git a/riptable/rt_categorical.py b/riptable/rt_categorical.py
@@ -1531,6 +1531,7 @@ def __new__(
         grouping = None
 
         # prepare to eliminate sort_gb
+        arg_sort_gb = sort_gb
         if sort_display is not None:
             sort_gb = sort_display
 
@@ -1547,10 +1548,12 @@ def __new__(
             _lex = lex
 
         # default to bytestrings - more performant, less memory
+        arg_unicode = unicode
         if unicode is None:
             unicode = False
 
         # default to 1-based indexing (filtering, etc. fully supported in this mode)
+        arg_base_index = base_index
         if base_index is None:
             base_index = 1
 
@@ -1574,16 +1577,17 @@ def __new__(
                     # ordered = True
 
         # from categorical, deep copy - send to regular categorical.copy() to correctly preserve attributes
+        # use original arguments rather than defaulted ones to avoid warnings in copy().
         if isinstance(values, Categorical):
             return values.copy(
                 categories=categories,  # main data
                 ordered=ordered,
-                sort_gb=sort_gb,
+                sort_gb=arg_sort_gb,
                 lex=lex,  # sorting/hashing
-                base_index=base_index,
+                base_index=arg_base_index,
                 filter=filter,  # priority options
                 dtype=dtype,
-                unicode=unicode,
+                unicode=arg_unicode,
                 invalid=invalid,
                 auto_add=auto_add,  # misc options
                 from_matlab=from_matlab,

diff --git a/riptable/rt_datetime.py b/riptable/rt_datetime.py
@@ -16,12 +16,13 @@
     "strptime_to_nano",
 ]
 
-import time
-import warnings
 from datetime import date
 from datetime import datetime as dt
 from datetime import timezone
+import math
+import time
 from typing import TYPE_CHECKING, List, Optional, Tuple, Union
+import warnings
 
 import numpy as np
 import riptide_cpp as rc
@@ -1543,6 +1544,94 @@ def isnotnan(self):
         """
         return ~self.isnan()
 
+    # ------------------------------------------------------------
+    def isfinite(self):
+        """
+        Return a boolean array that's True for each `Date` element that's
+        not a NaN (Not a Number), False otherwise.
+
+        Both the DateTime NaN (0) and Riptable's int32 sentinel value are
+        considered to be NaN.
+
+        The result is the element-wise negation of `Date.isnan`, so for a
+        `Date`, "finite" means exactly "not NaN".
+
+        Returns
+        -------
+        `FastArray`
+            A `FastArray` of booleans that's True for each non-NaN element,
+            False otherwise.
+
+        See Also
+        --------
+        Date.isnotfinite, Date.isnan, Date.isnotnan, DateTimeNano.isnan,
+        DateTimeNano.isnotnan, riptable.isnan,
+        riptable.isnotnan, riptable.isnanorzero, FastArray.isnan,
+        FastArray.isnotnan, FastArray.notna, FastArray.isnanorzero,
+        Categorical.isnan, Categorical.isnotnan, Categorical.notna
+        Dataset.mask_or_isnan :
+            Return a boolean array that's True for each `Dataset` row that
+            contains at least one NaN.
+        Dataset.mask_and_isnan :
+            Return a boolean array that's True for each all-NaN `Dataset` row.
+
+        Notes
+        -----
+        Riptable currently uses 0 for the DateTime NaN value. This constant is
+        held in the `DateTimeBase` class.
+
+        Examples
+        --------
+        >>> d = rt.Date.range('20190201', days = 3, step = 2)
+        >>> d[0] = 0
+        >>> d[1] = d.inv
+        >>> d
+        Date(['Inv', 'Inv', '2019-02-05'])
+        >>> d.isfinite()
+        FastArray([False, False,  True])
+        """
+        # Defined in terms of isnan() so the two methods cannot disagree.
+        return ~self.isnan()
+
+    # ------------------------------------------------------------
+    def isnotfinite(self):
+        """
+        Return a boolean array that's True for each `Date` element that's
+        a NaN (Not a Number), False otherwise.
+
+        Both the DateTime NaN (0) and Riptable's int32 sentinel value are
+        considered to be NaN.
+
+        Returns
+        -------
+        `FastArray`
+            A `FastArray` of booleans that's True for each NaN element, False
+            otherwise.
+
+        See Also
+        --------
+        Date.isfinite, Date.isnan, Date.isnotnan, DateTimeNano.isnan,
+        DateTimeNano.isnotnan, riptable.isnan,
+        riptable.isnotnan, riptable.isnanorzero, FastArray.isnan,
+        FastArray.isnotnan, FastArray.notna, FastArray.isnanorzero,
+        Categorical.isnan, Categorical.isnotnan, Categorical.notna
+        Dataset.mask_or_isnan :
+            Return a boolean array that's True for each `Dataset` row that
+            contains at least one NaN.
+        Dataset.mask_and_isnan :
+            Return a boolean array that's True for each all-NaN `Dataset` row.
+
+        Notes
+        -----
+        Riptable currently uses 0 for the DateTime NaN value. This constant is
+        held in the `DateTimeBase` class.
+
+        Examples
+        --------
+        >>> d = rt.Date.range('20190201', days = 3, step = 2)
+        >>> d[0] = 0
+        >>> d[1] = d.inv
+        >>> d
+        Date(['Inv', 'Inv', '2019-02-05'])
+        >>> d.isnotfinite()
+        FastArray([ True,  True, False])
+        """
+        # isnanorzero() on self._fa flags both NaN encodings (the 0 value and
+        # the sentinel), as the docstring example above shows.
+        return self._fa.isnanorzero()
+
     # ------------------------------------------------------------
     @property
     def yyyymmdd(self):
@@ -2569,7 +2658,7 @@ class DateSpan(DateBase):
         "version": 0,  # if no version, assume before versions implemented
         "instance_vars": {"_display_length": DisplayLength.Long},
     }
-    NAN_DATE = INVALID_DICT[7]  # int32 sentinel
+    NAN_DATE = INVALID_DICT[np.dtype(np.int32).num]  # int32 sentinel
     forbidden_mathops = ()
 
     def __new__(cls, arr, unit=None):
@@ -5084,6 +5173,96 @@ def isnotnan(self):
         """
         return ~self.isnan()
 
+    # -------------------------------------------------------------
+    def isfinite(self):
+        """
+        Return a boolean array that's True for each `DateTimeNano` element
+        that's not a NaN (Not a Number), False otherwise.
+
+        Both the DateTime NaN (0) and Riptable's int64 sentinel value are
+        considered to be NaN.
+
+        The result is the element-wise negation of `DateTimeNano.isnan`, so
+        for a `DateTimeNano`, "finite" means exactly "not NaN".
+
+        Returns
+        -------
+        `FastArray`
+            A `FastArray` of booleans that's True for each non-NaN element,
+            False otherwise.
+
+        See Also
+        --------
+        DateTimeNano.isnotfinite, DateTimeNano.isnan, DateTimeNano.isnotnan,
+        Date.isnan, Date.isnotnan, riptable.isnan,
+        riptable.isnotnan, riptable.isnanorzero, FastArray.isnan,
+        FastArray.isnotnan, FastArray.notna, FastArray.isnanorzero,
+        Categorical.isnan, Categorical.isnotnan, Categorical.notna
+        Dataset.mask_or_isnan :
+            Return a boolean array that's True for each `Dataset` row that
+            contains at least one NaN.
+        Dataset.mask_and_isnan :
+            Return a boolean array that's True for each all-NaN `Dataset` row.
+
+        Notes
+        -----
+        Riptable currently uses 0 for the DateTime NaN value. This constant is
+        held in the `DateTimeBase` class.
+
+        Examples
+        --------
+        >>> dtn = rt.DateTimeNano(['20210101 09:31:15', '20210519 05:21:17',
+        ...                        '20210713 02:44:19'], from_tz = 'NYC')
+        >>> dtn[0] = 0
+        >>> dtn[1] = dtn.inv
+        >>> dtn
+        DateTimeNano(['Inv', 'Inv', '20210712 22:44:19.000000000'], to_tz='NYC')
+        >>> dtn.isfinite()
+        FastArray([False, False,  True])
+        """
+        # Defined in terms of isnan() so the two methods cannot disagree.
+        return ~self.isnan()
+
+    # -------------------------------------------------------------
+    def isnotfinite(self):
+        """
+        Return a boolean array that's True for each `DateTimeNano` element
+        that's a NaN (Not a Number), False otherwise.
+
+        Both the DateTime NaN (0) and Riptable's int64 sentinel value are
+        considered to be NaN.
+
+        Returns
+        -------
+        `FastArray`
+            A `FastArray` of booleans that's True for each NaN element, False
+            otherwise.
+
+        See Also
+        --------
+        DateTimeNano.isfinite, DateTimeNano.isnan, DateTimeNano.isnotnan,
+        Date.isnan, Date.isnotnan, riptable.isnan,
+        riptable.isnotnan, riptable.isnanorzero, FastArray.isnan,
+        FastArray.isnotnan, FastArray.notna, FastArray.isnanorzero,
+        Categorical.isnan, Categorical.isnotnan, Categorical.notna
+        Dataset.mask_or_isnan :
+            Return a boolean array that's True for each `Dataset` row that contains
+            at least one NaN.
+        Dataset.mask_and_isnan :
+            Return a boolean array that's True for each all-NaN `Dataset` row.
+
+        Notes
+        -----
+        Riptable currently uses 0 for the DateTime NaN value. This constant is
+        held in the `DateTimeBase` class.
+
+        Examples
+        --------
+        >>> dtn = rt.DateTimeNano(['20210101 09:31:15', '20210519 05:21:17',
+        ...                        '20210713 02:44:19'], from_tz = 'NYC')
+        >>> dtn[0] = 0
+        >>> dtn[1] = dtn.inv
+        >>> dtn
+        DateTimeNano(['Inv', 'Inv', '20210712 22:44:19.000000000'], to_tz='NYC')
+        >>> dtn.isnotfinite()
+        FastArray([ True,  True, False])
+        """
+        # isnanorzero() on self._fa flags both NaN encodings (the 0 value and
+        # the sentinel), as the docstring example above shows.
+        return self._fa.isnanorzero()
+
     # -------------------------------------------------------------
     def _datetimenano_compare_check(self, funcname, other):
         caller = self._fa
@@ -6986,6 +7165,8 @@ class DateSpanScalar(np.int32):
 
     __slots__ = "_display_length"
 
+    NAN_DATESPANSCALAR = INVALID_DICT[np.dtype(np.int32).num]  # int32 sentinel
+
     # ------------------------------------------------------------
     def __new__(cls, arr, **kwargs):
         """Create the scalar from `arr` via the parent class; extra keyword arguments are accepted but not forwarded."""
         return super().__new__(cls, arr)
@@ -7023,6 +7204,22 @@ def __str__(self):
     def _np(self):
         return self.view(np.int32)
 
+    # ------------------------------------------------------------
+    def isnan(self):
+        """Return whether this value equals the int32 sentinel `DateSpanScalar.NAN_DATESPANSCALAR`."""
+        return self == DateSpanScalar.NAN_DATESPANSCALAR
+
+    # ------------------------------------------------------------
+    def isnotnan(self):
+        """Return whether this value differs from the int32 sentinel `DateSpanScalar.NAN_DATESPANSCALAR`."""
+        return self != DateSpanScalar.NAN_DATESPANSCALAR
+
+    # ------------------------------------------------------------
+    def isfinite(self):
+        """
+        Return whether this value differs from the int32 sentinel
+        `DateSpanScalar.NAN_DATESPANSCALAR`.
+
+        This is the same test as `isnotnan`: the sentinel is the only value
+        treated as non-finite here.
+        """
+        return self != DateSpanScalar.NAN_DATESPANSCALAR
+
+    # ------------------------------------------------------------
+    def isnotfinite(self):
+        """
+        Return whether this value equals the int32 sentinel
+        `DateSpanScalar.NAN_DATESPANSCALAR`.
+
+        This is the same test as `isnan`: the sentinel is the only value
+        treated as non-finite here.
+        """
+        return self == DateSpanScalar.NAN_DATESPANSCALAR
+
     # ------------------------------------------------------------
     @property
     def _fa(self):
@@ -7133,6 +7330,18 @@ def strftime(self, format):
     def isnan(self):
         """Return whether this value is NaN, defined here as being less than or equal to zero."""
         return self <= 0
 
+    # ------------------------------------------------------------
+    def isnotnan(self):
+        """Return whether this value is not NaN, defined here as being strictly greater than zero."""
+        return self > 0
+
+    # ------------------------------------------------------------
+    def isfinite(self):
+        """
+        Return whether this value is strictly greater than zero.
+
+        This is the same test as `isnotnan`: values less than or equal to
+        zero are the only ones treated as non-finite here.
+        """
+        return self > 0
+
+    # ------------------------------------------------------------
+    def isnotfinite(self):
+        """
+        Return whether this value is less than or equal to zero.
+
+        This is the same test as `isnan`.
+        """
+        return self <= 0
+
     # ------------------------------------------------------------
     @property
     def _np(self):
@@ -7262,6 +7471,22 @@ def strftime(self, format):
         """
         return self._strftime(format)
 
+    # ------------------------------------------------------------
+    def isnan(self):
+        """Return True if this value is a floating-point NaN, as determined by `math.isnan`."""
+        return math.isnan(self)
+
+    # ------------------------------------------------------------
+    def isnotnan(self):
+        """Return True unless this value is a floating-point NaN, as determined by `math.isnan`."""
+        return not math.isnan(self)
+
+    # ------------------------------------------------------------
+    def isfinite(self):
+        """
+        Return True if this value is neither infinite nor NaN, as determined
+        by `math.isfinite`.
+
+        Unlike `isnotnan`, this also returns False for positive or negative
+        infinity.
+        """
+        return math.isfinite(self)
+
+    # ------------------------------------------------------------
+    def isnotfinite(self):
+        """
+        Return True if this value is infinite or NaN, i.e. whenever
+        `math.isfinite` returns False.
+        """
+        return not math.isfinite(self)
+
     # ------------------------------------------------------------
     def get_classname(self):
         """Return the name of the class in which this method is defined."""
         # `__class__` here is the implicit cell for the lexically enclosing class,
         # not `type(self)`: a subclass that inherits this method reports this
         # class's name rather than its own.
         return __class__.__name__

diff --git a/riptable/rt_groupbyops.py b/riptable/rt_groupbyops.py
@@ -957,6 +957,7 @@ def null(self, showfilter=False):
             TypeRegister.Dataset({}), self.gb_keychain, None, addkeys=True, showfilter=showfilter
         )
 
+    _USE_FAST_COUNT_UNIQUES = False  # re-enable/delete slow once riptable#209 is fixed.
     # ---------------------------------------------------------------
     def count_uniques(self, *args, **kwargs):
         """
@@ -989,19 +990,35 @@ def count_uniques(self, *args, **kwargs):
         label_keys = self.gb_keychain
         g = self.grouping
 
-        filter = kwargs["filter"] if "filter" in kwargs else None
-        transform = kwargs["transform"] if "transform" in kwargs else None
+        if GroupByOps._USE_FAST_COUNT_UNIQUES:
+            filter = kwargs["filter"] if "filter" in kwargs else None
+            transform = kwargs["transform"] if "transform" in kwargs else None
+
+            newdict = {}
+            for colname, arr in origdict.items():
+                gbk = label_keys.gbkeys
+                if colname not in gbk:
+                    mcat = TypeRegister.Categorical([self, arr], filter=filter)
+                    if transform:
+                        result = self.nansum(mcat.first_bool, transform=transform)[0]
+                    else:
+                        result = mcat.null()[0].count().Count
+                    newdict[colname] = result
+
+        else:  # slow code
+            # get way to make groups contiguous
+            igroup = g.igroup
+            cutoffs = g.ncountgroup.cumsum(dtype=np.int64)[1:]
+            newdict = {}
+            for colname, arr in origdict.items():
+                gbk = label_keys.gbkeys
+                if colname not in gbk:
+                    ifirstkey = groupbyhash(arr[igroup], cutoffs=cutoffs)["iFirstKey"][1]
+                    # the cutoffs will generate iFirstKey cutoffs that help us determine the unique counts
+                    result = ifirstkey.diff()
+                    result[0] = ifirstkey[0]
+                    newdict[colname] = result
 
-        newdict = {}
-        for colname, arr in origdict.items():
-            gbk = label_keys.gbkeys
-            if colname not in gbk:
-                mcat = TypeRegister.Categorical([self, arr], filter=filter)
-                if transform:
-                    result = self.nansum(mcat.first_bool, transform=transform)[0]
-                else:
-                    result = mcat.null()[0].count().Count
-                newdict[colname] = result
         return g._finalize_dataset(newdict, label_keys, None, addkeys=True, **kwargs)
 
     def _gb_keyword_wrapper(