From c4e1c186d7fa0cb2800d6a3ab430ab916a7a1310 Mon Sep 17 00:00:00 2001 From: Niruta Talwekar Date: Tue, 24 Jun 2025 14:10:47 -0700 Subject: [PATCH 1/9] slack link update --- doc/source/development/community.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/development/community.rst b/doc/source/development/community.rst index 1c698d130ea6c..e139ea0376771 100644 --- a/doc/source/development/community.rst +++ b/doc/source/development/community.rst @@ -114,7 +114,7 @@ people who are hesitant to bring up their questions or ideas on a large public mailing list or GitHub. If this sounds like the right place for you, you are welcome to join using -`this link `_! +`this link `_! Please remember to follow our `Code of Conduct `_, and be aware that our admins are monitoring for irrelevant messages and will remove folks who use our From 921695444d0d53cff1517d8edf904474b6f1c246 Mon Sep 17 00:00:00 2001 From: Niruta Talwekar Date: Fri, 8 Aug 2025 20:06:29 -0700 Subject: [PATCH 2/9] object --- doc/source/user_guide/categorical.rst | 29 ++++++++++++++++++++++ pandas/core/arrays/categorical.py | 13 +++++++--- pandas/tests/extension/test_categorical.py | 25 +++++++++++++++++++ 3 files changed, 64 insertions(+), 3 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 1e7d66dfeb142..51d6fd4a9e3ad 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -1178,3 +1178,32 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse ``Categorica This also happens in some cases when you supply a NumPy array instead of a ``Categorical``: using an int array (e.g. ``np.array([1,2,3,4])``) will exhibit the same behavior, while using a string array (e.g. ``np.array(["a","b","c","a"])``) will not. + +.. 
note:: + + When constructing a :class:`pandas.Categorical` from a pandas :class:`Series` or + :class:`Index` with ``dtype='object'``, the dtype of the categories will be + preserved as ``object``. When constructing from a NumPy array + with ``dtype='object'`` or a raw Python sequence, pandas will infer the most + specific dtype for the categories (for example, ``str`` if all elements are strings). + +.. ipython:: python + + pd.options.future.infer_string = True + ser = pd.Series(["foo", "bar", "baz"], dtype="object") + idx = pd.Index(["foo", "bar", "baz"], dtype="object") + arr = np.array(["foo", "bar", "baz"], dtype="object") + pylist = ["foo", "bar", "baz"] + + cat_from_ser = pd.Categorical(ser) + cat_from_idx = pd.Categorical(idx) + cat_from_arr = pd.Categorical(arr) + cat_from_list = pd.Categorical(pylist) + + # Series/Index with object dtype: preserve object dtype + assert cat_from_ser.categories.dtype == "object" + assert cat_from_idx.categories.dtype == "object" + + # Numpy array or list: infer string dtype + assert cat_from_arr.categories.dtype == "str" + assert cat_from_list.categories.dtype == "str" diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index d57856115d276..fa550a7f46617 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -457,6 +457,11 @@ def __init__( codes = arr.indices.to_numpy() dtype = CategoricalDtype(categories, values.dtype.pyarrow_dtype.ordered) else: + # Check for pandas Series/ Index with object dtye + preserve_object_dtpe = False + if isinstance(values, (ABCSeries, ABCIndex)): + if getattr(values.dtype, "name", None) == "object": + preserve_object_dtpe = True if not isinstance(values, ABCIndex): # in particular RangeIndex xref test_index_equal_range_categories values = sanitize_array(values, None) @@ -465,15 +470,17 @@ def __init__( except TypeError as err: codes, categories = factorize(values, sort=False) if dtype.ordered: - # raise, as we don't have a 
sortable data structure and so - # the user should give us one by specifying categories raise TypeError( "'values' is not ordered, please " "explicitly specify the categories order " "by passing in a categories argument." ) from err - # we're inferring from values + # If we should prserve object dtype, force categories to object dtype + if preserve_object_dtpe: + from pandas import Index + + categories = Index(categories, dtype=object, copy=False) dtype = CategoricalDtype(categories, dtype.ordered) elif isinstance(values.dtype, CategoricalDtype): diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 8f8af607585df..5a519a261b029 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -180,6 +180,31 @@ def test_array_repr(self, data, size): def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) + def test_categorical_preserve_object_dtype_from_pandas(self): + import numpy as np + + import pandas as pd + + pd.options.future.infer_string = True + + ser = pd.Series(["foo", "bar", "baz"], dtype="object") + idx = pd.Index(["foo", "bar", "baz"], dtype="object") + arr = np.array(["foo", "bar", "baz"], dtype="object") + pylist = ["foo", "bar", "baz"] + + cat_from_ser = Categorical(ser) + cat_from_idx = Categorical(idx) + cat_from_arr = Categorical(arr) + cat_from_list = Categorical(pylist) + + # Series/Index with object dtype: preserve object dtype + assert cat_from_ser.categories.dtype == "object" + assert cat_from_idx.categories.dtype == "object" + + # Numpy array or list: infer string dtype + assert cat_from_arr.categories.dtype == "str" + assert cat_from_list.categories.dtype == "str" + class Test2DCompat(base.NDArrayBacked2DTests): def test_repr_2d(self, data): From 8f460acc845be6726582995ca7714c73f764d77f Mon Sep 17 00:00:00 2001 From: Niruta Talwekar Date: Fri, 8 Aug 2025 20:13:29 -0700 
Subject: [PATCH 3/9] whatsnew --- doc/source/whatsnew/v3.0.0.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/source/whatsnew/v3.0.0.rst b/doc/source/whatsnew/v3.0.0.rst index 3191c077d3c36..5501d3fa8b08e 100644 --- a/doc/source/whatsnew/v3.0.0.rst +++ b/doc/source/whatsnew/v3.0.0.rst @@ -690,7 +690,7 @@ Categorical - Bug in :meth:`Categorical.astype` where ``copy=False`` would still trigger a copy of the codes (:issue:`62000`) - Bug in :meth:`DataFrame.pivot` and :meth:`DataFrame.set_index` raising an ``ArrowNotImplementedError`` for columns with pyarrow dictionary dtype (:issue:`53051`) - Bug in :meth:`Series.convert_dtypes` with ``dtype_backend="pyarrow"`` where empty :class:`CategoricalDtype` :class:`Series` raised an error or got converted to ``null[pyarrow]`` (:issue:`59934`) -- +- Bug in :class:`Categorical` where constructing from a pandas :class:`Series` or :class:`Index` with ``dtype='object'`` did not preserve the categories' dtype as ``object``. The ``object`` dtype is now preserved in these cases, while NumPy arrays with ``dtype='object'`` and plain Python sequences continue to infer the most specific dtype (for example, ``str`` if all elements are strings). 
Datetimelike ^^^^^^^^^^^^ From 87a54feb69f380c911290da89ececd6660ad867b Mon Sep 17 00:00:00 2001 From: Niruta Talwekar Date: Wed, 27 Aug 2025 22:51:54 -0700 Subject: [PATCH 4/9] some comments --- pandas/core/arrays/categorical.py | 2 ++ .../arrays/categorical/test_constructors.py | 20 +++++++++++++++ pandas/tests/extension/test_categorical.py | 25 ------------------- 3 files changed, 22 insertions(+), 25 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index fa550a7f46617..2847da71a17d0 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -468,6 +468,8 @@ def __init__( try: codes, categories = factorize(values, sort=True) except TypeError as err: + # raise, as we don't have a sortable data structure and so + # the user should give us one by specifying categories codes, categories = factorize(values, sort=False) if dtype.ordered: raise TypeError( diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index cf2de894cc0c0..dc68f8abe234a 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -786,3 +786,23 @@ def test_range_values_preserves_rangeindex_categories(self, values, categories): result = Categorical(values=values, categories=categories).categories expected = RangeIndex(range(5)) tm.assert_index_equal(result, expected, exact=True) + + def test_categorical_preserve_object_dtype_from_pandas(self): + with pd.option_context("future.infer_string", True): + ser = Series(["foo", "bar", "baz"], dtype="object") + idx = Index(["foo", "bar", "baz"], dtype="object") + arr = np.array(["foo", "bar", "baz"], dtype="object") + pylist = ["foo", "bar", "baz"] + + cat_from_ser = Categorical(ser) + cat_from_idx = Categorical(idx) + cat_from_arr = Categorical(arr) + cat_from_list = Categorical(pylist) + + # Series/Index with object dtype: preserve object dtype + 
assert cat_from_ser.categories.dtype == "object" + assert cat_from_idx.categories.dtype == "object" + + # Numpy array or list: infer string dtype + assert cat_from_arr.categories.dtype == "str" + assert cat_from_list.categories.dtype == "str" diff --git a/pandas/tests/extension/test_categorical.py b/pandas/tests/extension/test_categorical.py index 5a519a261b029..8f8af607585df 100644 --- a/pandas/tests/extension/test_categorical.py +++ b/pandas/tests/extension/test_categorical.py @@ -180,31 +180,6 @@ def test_array_repr(self, data, size): def test_groupby_extension_agg(self, as_index, data_for_grouping): super().test_groupby_extension_agg(as_index, data_for_grouping) - def test_categorical_preserve_object_dtype_from_pandas(self): - import numpy as np - - import pandas as pd - - pd.options.future.infer_string = True - - ser = pd.Series(["foo", "bar", "baz"], dtype="object") - idx = pd.Index(["foo", "bar", "baz"], dtype="object") - arr = np.array(["foo", "bar", "baz"], dtype="object") - pylist = ["foo", "bar", "baz"] - - cat_from_ser = Categorical(ser) - cat_from_idx = Categorical(idx) - cat_from_arr = Categorical(arr) - cat_from_list = Categorical(pylist) - - # Series/Index with object dtype: preserve object dtype - assert cat_from_ser.categories.dtype == "object" - assert cat_from_idx.categories.dtype == "object" - - # Numpy array or list: infer string dtype - assert cat_from_arr.categories.dtype == "str" - assert cat_from_list.categories.dtype == "str" - class Test2DCompat(base.NDArrayBacked2DTests): def test_repr_2d(self, data): From cddc5746e19b3e864fe36d2ca470b4f660dd508d Mon Sep 17 00:00:00 2001 From: Niruta Talwekar Date: Wed, 27 Aug 2025 23:08:35 -0700 Subject: [PATCH 5/9] comment restore --- pandas/core/arrays/categorical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index 2847da71a17d0..bee3caa0e4a84 100644 --- a/pandas/core/arrays/categorical.py +++ 
b/pandas/core/arrays/categorical.py @@ -468,10 +468,10 @@ def __init__( try: codes, categories = factorize(values, sort=True) except TypeError as err: - # raise, as we don't have a sortable data structure and so - # the user should give us one by specifying categories codes, categories = factorize(values, sort=False) if dtype.ordered: + # raise, as we don't have a sortable data structure and so + # the user should give us one by specifying categories raise TypeError( "'values' is not ordered, please " "explicitly specify the categories order " From e83e4f9ccbc0dfc253ebfb264c2eb52a5d3acc4c Mon Sep 17 00:00:00 2001 From: Niruta Talwekar Date: Thu, 28 Aug 2025 00:27:29 -0700 Subject: [PATCH 6/9] assertionerror fix --- pandas/core/arrays/categorical.py | 6 ++++-- .../arrays/categorical/test_constructors.py | 19 ++++++++++++++----- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index bee3caa0e4a84..cc8f6cfa8ef41 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -480,9 +480,11 @@ def __init__( # If we should prserve object dtype, force categories to object dtype if preserve_object_dtpe: - from pandas import Index + # Only preserve object dtype if not all elements are strings + if not all(isinstance(x, str) for x in categories): + from pandas import Index - categories = Index(categories, dtype=object, copy=False) + categories = Index(categories, dtype=object, copy=False) dtype = CategoricalDtype(categories, dtype.ordered) elif isinstance(values.dtype, CategoricalDtype): diff --git a/pandas/tests/arrays/categorical/test_constructors.py b/pandas/tests/arrays/categorical/test_constructors.py index dc68f8abe234a..d16daf76304a0 100644 --- a/pandas/tests/arrays/categorical/test_constructors.py +++ b/pandas/tests/arrays/categorical/test_constructors.py @@ -799,10 +799,19 @@ def test_categorical_preserve_object_dtype_from_pandas(self): cat_from_arr = 
Categorical(arr) cat_from_list = Categorical(pylist) - # Series/Index with object dtype: preserve object dtype - assert cat_from_ser.categories.dtype == "object" - assert cat_from_idx.categories.dtype == "object" + # Series/Index with object dtype: infer string + # dtype if all elements are strings + assert cat_from_ser.categories.inferred_type == "string" + assert cat_from_idx.categories.inferred_type == "string" # Numpy array or list: infer string dtype - assert cat_from_arr.categories.dtype == "str" - assert cat_from_list.categories.dtype == "str" + assert cat_from_arr.categories.inferred_type == "string" + assert cat_from_list.categories.inferred_type == "string" + + # Mixed types: preserve object dtype + ser_mixed = Series(["foo", 1, None], dtype="object") + idx_mixed = Index(["foo", 1, None], dtype="object") + cat_mixed_ser = Categorical(ser_mixed) + cat_mixed_idx = Categorical(idx_mixed) + assert cat_mixed_ser.categories.dtype == "object" + assert cat_mixed_idx.categories.dtype == "object" From 5ed039a779bd39ff2c75434c27652727a8d45fbf Mon Sep 17 00:00:00 2001 From: Niruta Talwekar Date: Thu, 28 Aug 2025 01:01:23 -0700 Subject: [PATCH 7/9] rst changes --- doc/source/user_guide/categorical.rst | 41 ++++++++++++++++----------- 1 file changed, 25 insertions(+), 16 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index 51d6fd4a9e3ad..b6d70e87b95b2 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -1185,25 +1185,34 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse ``Categorica :class:`Index` with ``dtype='object'``, the dtype of the categories will be preserved as ``object``. When constructing from a NumPy array with ``dtype='object'`` or a raw Python sequence, pandas will infer the most - specific dtype for the categories (for example, ``str`` if all elements are strings). 
+ specific dtype for the categories (for example, ``string`` if all elements are strings). .. ipython:: python - pd.options.future.infer_string = True - ser = pd.Series(["foo", "bar", "baz"], dtype="object") - idx = pd.Index(["foo", "bar", "baz"], dtype="object") - arr = np.array(["foo", "bar", "baz"], dtype="object") - pylist = ["foo", "bar", "baz"] + with pd.option_context("future.infer_string", True): + ser = Series(["foo", "bar", "baz"], dtype="object") + idx = Index(["foo", "bar", "baz"], dtype="object") + arr = np.array(["foo", "bar", "baz"], dtype="object") + pylist = ["foo", "bar", "baz"] - cat_from_ser = pd.Categorical(ser) - cat_from_idx = pd.Categorical(idx) - cat_from_arr = pd.Categorical(arr) - cat_from_list = pd.Categorical(pylist) + cat_from_ser = Categorical(ser) + cat_from_idx = Categorical(idx) + cat_from_arr = Categorical(arr) + cat_from_list = Categorical(pylist) - # Series/Index with object dtype: preserve object dtype - assert cat_from_ser.categories.dtype == "object" - assert cat_from_idx.categories.dtype == "object" + # Series/Index with object dtype: infer string + # dtype if all elements are strings + assert cat_from_ser.categories.inferred_type == "string" + assert cat_from_idx.categories.inferred_type == "string" - # Numpy array or list: infer string dtype - assert cat_from_arr.categories.dtype == "str" - assert cat_from_list.categories.dtype == "str" + # Numpy array or list: infer string dtype + assert cat_from_arr.categories.inferred_type == "string" + assert cat_from_list.categories.inferred_type == "string" + + # Mixed types: preserve object dtype + ser_mixed = Series(["foo", 1, None], dtype="object") + idx_mixed = Index(["foo", 1, None], dtype="object") + cat_mixed_ser = Categorical(ser_mixed) + cat_mixed_idx = Categorical(idx_mixed) + assert cat_mixed_ser.categories.dtype == "object" + assert cat_mixed_idx.categories.dtype == "object" From 9b4b2d91bc87c8be7695bf3be12cb626bd4db886 Mon Sep 17 00:00:00 2001 From: Niruta Talwekar Date: 
Thu, 28 Aug 2025 01:16:26 -0700 Subject: [PATCH 8/9] rst import error --- doc/source/user_guide/categorical.rst | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/doc/source/user_guide/categorical.rst b/doc/source/user_guide/categorical.rst index b6d70e87b95b2..73b252929ea72 100644 --- a/doc/source/user_guide/categorical.rst +++ b/doc/source/user_guide/categorical.rst @@ -1190,18 +1190,17 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse ``Categorica .. ipython:: python with pd.option_context("future.infer_string", True): - ser = Series(["foo", "bar", "baz"], dtype="object") - idx = Index(["foo", "bar", "baz"], dtype="object") + ser = pd.Series(["foo", "bar", "baz"], dtype="object") + idx = pd.Index(["foo", "bar", "baz"], dtype="object") arr = np.array(["foo", "bar", "baz"], dtype="object") pylist = ["foo", "bar", "baz"] - cat_from_ser = Categorical(ser) - cat_from_idx = Categorical(idx) - cat_from_arr = Categorical(arr) - cat_from_list = Categorical(pylist) + cat_from_ser = pd.Categorical(ser) + cat_from_idx = pd.Categorical(idx) + cat_from_arr = pd.Categorical(arr) + cat_from_list = pd.Categorical(pylist) - # Series/Index with object dtype: infer string - # dtype if all elements are strings + # Series/Index with object dtype: infer string dtype assert cat_from_ser.categories.inferred_type == "string" assert cat_from_idx.categories.inferred_type == "string" @@ -1210,9 +1209,9 @@ Use ``copy=True`` to prevent such a behaviour or simply don't reuse ``Categorica assert cat_from_list.categories.inferred_type == "string" # Mixed types: preserve object dtype - ser_mixed = Series(["foo", 1, None], dtype="object") - idx_mixed = Index(["foo", 1, None], dtype="object") - cat_mixed_ser = Categorical(ser_mixed) - cat_mixed_idx = Categorical(idx_mixed) + ser_mixed = pd.Series(["foo", 1, None], dtype="object") + idx_mixed = pd.Index(["foo", 1, None], dtype="object") + cat_mixed_ser = pd.Categorical(ser_mixed) + 
cat_mixed_idx = pd.Categorical(idx_mixed) assert cat_mixed_ser.categories.dtype == "object" assert cat_mixed_idx.categories.dtype == "object" From 1b81162ee3140f80f8686ceaf30358d5ee0d343b Mon Sep 17 00:00:00 2001 From: Niruta Talwekar Date: Sun, 5 Oct 2025 00:34:38 -0700 Subject: [PATCH 9/9] change condition --- pandas/core/arrays/categorical.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pandas/core/arrays/categorical.py b/pandas/core/arrays/categorical.py index de0c7fba18b46..224659a0aa699 100644 --- a/pandas/core/arrays/categorical.py +++ b/pandas/core/arrays/categorical.py @@ -457,7 +457,7 @@ def __init__( # Check for pandas Series/ Index with object dtye preserve_object_dtpe = False if isinstance(values, (ABCSeries, ABCIndex)): - if getattr(values.dtype, "name", None) == "object": + if values.dtype == "object": preserve_object_dtpe = True if not isinstance(values, ABCIndex): # in particular RangeIndex xref test_index_equal_range_categories @@ -475,7 +475,7 @@ def __init__( "by passing in a categories argument." ) from err - # If we should prserve object dtype, force categories to object dtype + # If we should preserve object dtype, force categories to object dtype if preserve_object_dtpe: # Only preserve object dtype if not all elements are strings if not all(isinstance(x, str) for x in categories):