Commit 73a6fef

Merge branch 'main' into fix/type_coercion_for_unobserved_categories

undermyumbrella1 committed Apr 20, 2024
2 parents 30013ee + de1131f commit 73a6fef
Showing 48 changed files with 637 additions and 483 deletions.
74 changes: 74 additions & 0 deletions .github/ISSUE_TEMPLATE/pdep_vote.yaml
@@ -0,0 +1,74 @@
name: PDEP Vote
description: Call for a vote on a PDEP
title: "VOTE: "
labels: [Vote]

body:
  - type: markdown
    attributes:
      value: >
        As per [PDEP-1](https://pandas.pydata.org/pdeps/0001-purpose-and-guidelines.html),
        the following issue template should be used when a maintainer has opened a PDEP
        discussion and is ready to call for a vote.
  - type: checkboxes
    attributes:
      label: Locked issue
      options:
        - label: >
            I locked this voting issue so that only voting members are able to cast
            their votes or comment on this issue.
          required: true
  - type: input
    id: PDEP-name
    attributes:
      label: PDEP number and title
      placeholder: >
        PDEP-1: Purpose and guidelines
    validations:
      required: true
  - type: input
    id: PDEP-link
    attributes:
      label: Pull request with discussion
      description: e.g. https://github.com/pandas-dev/pandas/pull/47444
    validations:
      required: true
  - type: input
    id: PDEP-rendered-link
    attributes:
      label: Rendered PDEP for easy reading
      description: e.g. https://github.com/pandas-dev/pandas/pull/47444/files?short_path=7c449e6#diff-7c449e698132205b235c501f7e47ebba38da4d2b7f9492c98f16745dba787041
    validations:
      required: true
  - type: input
    id: PDEP-number-of-discussion-participants
    attributes:
      label: Discussion participants
      description: >
        You may find it useful to list or total the number of participating members in the
        PDEP discussion PR. This would be the maximum possible disapprove votes.
      placeholder: >
        14 voting members participated in the PR discussion thus far.
  - type: input
    id: PDEP-vote-end
    attributes:
      label: Voting will close in 15 days.
      description: The voting period end date. ('Voting will close in 15 days.' will be automatically written)
  - type: markdown
    attributes:
      value: ---
  - type: textarea
    id: Vote
    attributes:
      label: Vote
      value: |
        Cast your vote in a comment below.
        * +1: approve.
        * 0: abstain.
            * Reason: A one sentence reason is required.
        * -1: disapprove
            * Reason: A one sentence reason is required.
        A disapprove vote requires prior participation in the linked discussion PR.
        @pandas-dev/pandas-core
    validations:
      required: true
35 changes: 3 additions & 32 deletions ci/code_checks.sh
@@ -81,22 +81,13 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.CategoricalIndex.ordered SA01" \
-i "pandas.DataFrame.__dataframe__ SA01" \
-i "pandas.DataFrame.__iter__ SA01" \
-i "pandas.DataFrame.assign SA01" \
-i "pandas.DataFrame.at_time PR01" \
-i "pandas.DataFrame.bfill SA01" \
-i "pandas.DataFrame.columns SA01" \
-i "pandas.DataFrame.copy SA01" \
-i "pandas.DataFrame.droplevel SA01" \
-i "pandas.DataFrame.dtypes SA01" \
-i "pandas.DataFrame.ffill SA01" \
-i "pandas.DataFrame.first_valid_index SA01" \
-i "pandas.DataFrame.get SA01" \
-i "pandas.DataFrame.hist RT03" \
-i "pandas.DataFrame.infer_objects RT03" \
-i "pandas.DataFrame.keys SA01" \
-i "pandas.DataFrame.kurt RT03,SA01" \
-i "pandas.DataFrame.kurtosis RT03,SA01" \
-i "pandas.DataFrame.last_valid_index SA01" \
-i "pandas.DataFrame.max RT03" \
-i "pandas.DataFrame.mean RT03,SA01" \
-i "pandas.DataFrame.median RT03,SA01" \
@@ -123,24 +114,18 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.DatetimeIndex.ceil SA01" \
-i "pandas.DatetimeIndex.date SA01" \
-i "pandas.DatetimeIndex.day SA01" \
-i "pandas.DatetimeIndex.day_name SA01" \
-i "pandas.DatetimeIndex.day_of_year SA01" \
-i "pandas.DatetimeIndex.dayofyear SA01" \
-i "pandas.DatetimeIndex.floor SA01" \
-i "pandas.DatetimeIndex.freqstr SA01" \
-i "pandas.DatetimeIndex.hour SA01" \
-i "pandas.DatetimeIndex.indexer_at_time PR01,RT03" \
-i "pandas.DatetimeIndex.indexer_between_time RT03" \
-i "pandas.DatetimeIndex.inferred_freq SA01" \
-i "pandas.DatetimeIndex.is_leap_year SA01" \
-i "pandas.DatetimeIndex.microsecond SA01" \
-i "pandas.DatetimeIndex.minute SA01" \
-i "pandas.DatetimeIndex.month SA01" \
-i "pandas.DatetimeIndex.month_name SA01" \
-i "pandas.DatetimeIndex.nanosecond SA01" \
-i "pandas.DatetimeIndex.quarter SA01" \
-i "pandas.DatetimeIndex.round SA01" \
-i "pandas.DatetimeIndex.second SA01" \
-i "pandas.DatetimeIndex.snap PR01,RT03,SA01" \
-i "pandas.DatetimeIndex.std PR01,RT03" \
-i "pandas.DatetimeIndex.time SA01" \
@@ -149,11 +134,10 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.DatetimeIndex.to_pydatetime RT03,SA01" \
-i "pandas.DatetimeIndex.tz SA01" \
-i "pandas.DatetimeIndex.tz_convert RT03" \
-i "pandas.DatetimeIndex.year SA01" \
-i "pandas.DatetimeTZDtype SA01" \
-i "pandas.DatetimeTZDtype.tz SA01" \
-i "pandas.DatetimeTZDtype.unit SA01" \
-i "pandas.Grouper PR02,SA01" \
-i "pandas.Grouper PR02" \
-i "pandas.HDFStore.append PR01,SA01" \
-i "pandas.HDFStore.get SA01" \
-i "pandas.HDFStore.groups SA01" \
@@ -303,7 +287,6 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Series.add PR07" \
-i "pandas.Series.at_time PR01" \
-i "pandas.Series.backfill PR01,SA01" \
-i "pandas.Series.bfill SA01" \
-i "pandas.Series.case_when RT03" \
-i "pandas.Series.cat PR07,SA01" \
-i "pandas.Series.cat.add_categories PR01,PR02" \
@@ -316,36 +299,31 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Series.cat.rename_categories PR01,PR02" \
-i "pandas.Series.cat.reorder_categories PR01,PR02" \
-i "pandas.Series.cat.set_categories PR01,PR02" \
-i "pandas.Series.copy SA01" \
-i "pandas.Series.div PR07" \
-i "pandas.Series.droplevel SA01" \
-i "pandas.Series.dt.as_unit PR01,PR02" \
-i "pandas.Series.dt.ceil PR01,PR02,SA01" \
-i "pandas.Series.dt.components SA01" \
-i "pandas.Series.dt.date SA01" \
-i "pandas.Series.dt.day SA01" \
-i "pandas.Series.dt.day_name PR01,PR02,SA01" \
-i "pandas.Series.dt.day_name PR01,PR02" \
-i "pandas.Series.dt.day_of_year SA01" \
-i "pandas.Series.dt.dayofyear SA01" \
-i "pandas.Series.dt.days SA01" \
-i "pandas.Series.dt.days_in_month SA01" \
-i "pandas.Series.dt.daysinmonth SA01" \
-i "pandas.Series.dt.floor PR01,PR02,SA01" \
-i "pandas.Series.dt.freq GL08" \
-i "pandas.Series.dt.hour SA01" \
-i "pandas.Series.dt.is_leap_year SA01" \
-i "pandas.Series.dt.microsecond SA01" \
-i "pandas.Series.dt.microseconds SA01" \
-i "pandas.Series.dt.minute SA01" \
-i "pandas.Series.dt.month SA01" \
-i "pandas.Series.dt.month_name PR01,PR02,SA01" \
-i "pandas.Series.dt.month_name PR01,PR02" \
-i "pandas.Series.dt.nanosecond SA01" \
-i "pandas.Series.dt.nanoseconds SA01" \
-i "pandas.Series.dt.normalize PR01" \
-i "pandas.Series.dt.quarter SA01" \
-i "pandas.Series.dt.qyear GL08" \
-i "pandas.Series.dt.round PR01,PR02,SA01" \
-i "pandas.Series.dt.second SA01" \
-i "pandas.Series.dt.seconds SA01" \
-i "pandas.Series.dt.strftime PR01,PR02" \
-i "pandas.Series.dt.time SA01" \
@@ -356,27 +334,20 @@ if [[ -z "$CHECK" || "$CHECK" == "docstrings" ]]; then
-i "pandas.Series.dt.tz_convert PR01,PR02,RT03" \
-i "pandas.Series.dt.tz_localize PR01,PR02" \
-i "pandas.Series.dt.unit GL08" \
-i "pandas.Series.dt.year SA01" \
-i "pandas.Series.dtype SA01" \
-i "pandas.Series.dtypes SA01" \
-i "pandas.Series.empty GL08" \
-i "pandas.Series.eq PR07,SA01" \
-i "pandas.Series.ffill SA01" \
-i "pandas.Series.first_valid_index SA01" \
-i "pandas.Series.floordiv PR07" \
-i "pandas.Series.ge PR07,SA01" \
-i "pandas.Series.get SA01" \
-i "pandas.Series.gt PR07,SA01" \
-i "pandas.Series.hasnans SA01" \
-i "pandas.Series.infer_objects RT03" \
-i "pandas.Series.is_monotonic_decreasing SA01" \
-i "pandas.Series.is_monotonic_increasing SA01" \
-i "pandas.Series.is_unique SA01" \
-i "pandas.Series.item SA01" \
-i "pandas.Series.keys SA01" \
-i "pandas.Series.kurt RT03,SA01" \
-i "pandas.Series.kurtosis RT03,SA01" \
-i "pandas.Series.last_valid_index SA01" \
-i "pandas.Series.le PR07,SA01" \
-i "pandas.Series.list.__getitem__ SA01" \
-i "pandas.Series.list.flatten SA01" \
11 changes: 2 additions & 9 deletions doc/source/user_guide/io.rst
@@ -1949,13 +1949,6 @@ Writing in ISO date format, with microseconds:
    json = dfd.to_json(date_format="iso", date_unit="us")
    json

Epoch timestamps, in seconds:

.. ipython:: python

    json = dfd.to_json(date_format="epoch", date_unit="s")
    json
Writing to a file, with a date index and a date column:

.. ipython:: python

@@ -1965,7 +1958,7 @@ Writing to a file, with a date index and a date column:
    dfj2["ints"] = list(range(5))
    dfj2["bools"] = True
    dfj2.index = pd.date_range("20130101", periods=5)
    dfj2.to_json("test.json")
    dfj2.to_json("test.json", date_format="iso")
    with open("test.json") as fh:
        print(fh.read())
@@ -2140,7 +2133,7 @@ Dates written in nanoseconds need to be read back in nanoseconds:

.. ipython:: python

    from io import StringIO

    json = dfj2.to_json(date_unit="ns")
    json = dfj2.to_json(date_format="iso", date_unit="ns")

    # Try to parse timestamps as milliseconds -> Won't Work
    dfju = pd.read_json(StringIO(json), date_unit="ms")
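The unit mismatch in this example is the motivation for preferring ``iso``: an epoch number carries no unit of its own, while an ISO 8601 string is self-describing. A minimal standard-library sketch of the ambiguity (illustrative values, not taken from the pandas docs):

```python
from datetime import datetime, timezone

# A bare epoch number is ambiguous: the reader must guess its unit.
epoch = 1356998400  # 2013-01-01T00:00:00Z -- if the writer meant seconds

as_seconds = datetime.fromtimestamp(epoch, tz=timezone.utc)
as_millis = datetime.fromtimestamp(epoch / 1000, tz=timezone.utc)

print(as_seconds.isoformat())  # 2013-01-01T00:00:00+00:00
print(as_millis.isoformat())   # a moment in January 1970 -- wildly different

# An ISO 8601 string carries its own precision and needs no unit hint.
iso = "2013-01-01T00:00:00+00:00"
assert datetime.fromisoformat(iso) == as_seconds
```

The same mismatch is what makes reading nanosecond data back with ``date_unit="ms"`` fail in the snippet above.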
11 changes: 8 additions & 3 deletions doc/source/whatsnew/v3.0.0.rst
@@ -37,6 +37,7 @@ Other enhancements
- Support reading value labels from Stata 108-format (Stata 6) and earlier files (:issue:`58154`)
- Users can globally disable any ``PerformanceWarning`` by setting the option ``mode.performance_warnings`` to ``False`` (:issue:`56920`)
- :meth:`Styler.format_index_names` can now be used to format the index and column names (:issue:`48936` and :issue:`47489`)
- :class:`.errors.DtypeWarning` improved to include column names when mixed data types are detected (:issue:`58174`)
- :meth:`DataFrame.cummin`, :meth:`DataFrame.cummax`, :meth:`DataFrame.cumprod` and :meth:`DataFrame.cumsum` methods now have a ``numeric_only`` parameter (:issue:`53072`)
- :meth:`DataFrame.fillna` and :meth:`Series.fillna` can now accept ``value=None``; for non-object dtype the corresponding NA value will be used (:issue:`57723`)

@@ -195,6 +196,7 @@ Other Deprecations
- Deprecated allowing non-keyword arguments in :meth:`DataFrame.all`, :meth:`DataFrame.min`, :meth:`DataFrame.max`, :meth:`DataFrame.sum`, :meth:`DataFrame.prod`, :meth:`DataFrame.mean`, :meth:`DataFrame.median`, :meth:`DataFrame.sem`, :meth:`DataFrame.var`, :meth:`DataFrame.std`, :meth:`DataFrame.skew`, :meth:`DataFrame.kurt`, :meth:`Series.all`, :meth:`Series.min`, :meth:`Series.max`, :meth:`Series.sum`, :meth:`Series.prod`, :meth:`Series.mean`, :meth:`Series.median`, :meth:`Series.sem`, :meth:`Series.var`, :meth:`Series.std`, :meth:`Series.skew`, and :meth:`Series.kurt`. (:issue:`57087`)
- Deprecated allowing non-keyword arguments in :meth:`Series.to_markdown` except ``buf``. (:issue:`57280`)
- Deprecated allowing non-keyword arguments in :meth:`Series.to_string` except ``buf``. (:issue:`57280`)
- Deprecated using ``epoch`` date format in :meth:`DataFrame.to_json` and :meth:`Series.to_json`, use ``iso`` instead. (:issue:`57063`)
-
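To sketch the output shape the now-preferred ``iso`` format produces, here is a hedged standard-library illustration; ``to_json_iso`` and the sample records are hypothetical helpers for this sketch, not a pandas API:

```python
import json
from datetime import datetime, timezone

def to_json_iso(records):
    """Serialize records, rendering datetimes as ISO 8601 strings
    (the shape an iso date format produces) instead of epoch numbers."""
    def encode(obj):
        if isinstance(obj, datetime):
            return obj.isoformat()
        raise TypeError(f"unserializable: {obj!r}")
    return json.dumps(records, default=encode)

rows = [{"when": datetime(2013, 1, 1, tzinfo=timezone.utc), "ints": 0}]
print(to_json_iso(rows))
# [{"when": "2013-01-01T00:00:00+00:00", "ints": 0}]
```

ISO strings round-trip without the caller having to remember which epoch unit the writer chose.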

.. ---------------------------------------------------------------------------
Expand All @@ -204,6 +206,7 @@ Removal of prior version deprecations/changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
- :class:`.DataFrameGroupBy.idxmin`, :class:`.DataFrameGroupBy.idxmax`, :class:`.SeriesGroupBy.idxmin`, and :class:`.SeriesGroupBy.idxmax` will now raise a ``ValueError`` when used with ``skipna=False`` and an NA value is encountered (:issue:`10694`)
- :func:`concat` no longer ignores empty objects when determining output dtypes (:issue:`39122`)
- :func:`concat` with all-NA entries no longer ignores the dtype of those entries when determining the result dtype (:issue:`40893`)
- :func:`read_excel`, :func:`read_json`, :func:`read_html`, and :func:`read_xml` no longer accept raw string or byte representation of the data. That type of data must be wrapped in a :py:class:`StringIO` or :py:class:`BytesIO` (:issue:`53767`)
- :meth:`DataFrame.groupby` with ``as_index=False`` and aggregation methods will no longer exclude from the result the groupings that do not arise from the input (:issue:`49519`)
- :meth:`Series.dt.to_pydatetime` now returns a :class:`Series` of :py:class:`datetime.datetime` objects (:issue:`52459`)
Expand All @@ -219,6 +222,7 @@ Removal of prior version deprecations/changes
- Disallow units other than "s", "ms", "us", "ns" for datetime64 and timedelta64 dtypes in :func:`array` (:issue:`53817`)
- Removed "freq" keyword from :class:`PeriodArray` constructor, use "dtype" instead (:issue:`52462`)
- Removed 'fastpath' keyword in :class:`Categorical` constructor (:issue:`20110`)
- Removed 'kind' keyword in :meth:`Series.resample` and :meth:`DataFrame.resample` (:issue:`58125`)
- Removed alias :class:`arrays.PandasArray` for :class:`arrays.NumpyExtensionArray` (:issue:`53694`)
- Removed deprecated "method" and "limit" keywords from :meth:`Series.replace` and :meth:`DataFrame.replace` (:issue:`53492`)
- Removed extension test classes ``BaseNoReduceTests``, ``BaseNumericReduceTests``, ``BaseBooleanReduceTests`` (:issue:`54663`)
@@ -331,6 +335,7 @@ Performance improvements
- Performance improvement in :meth:`RangeIndex.reindex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57647`, :issue:`57752`)
- Performance improvement in :meth:`RangeIndex.take` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57445`, :issue:`57752`)
- Performance improvement in :func:`merge` if hash-join can be used (:issue:`57970`)
- Performance improvement in :meth:`to_hdf`, avoiding unnecessary reopenings of the HDF5 file and speeding up data addition to files with a very large number of groups (:issue:`58248`)
- Performance improvement in ``DataFrameGroupBy.__len__`` and ``SeriesGroupBy.__len__`` (:issue:`57595`)
- Performance improvement in indexing operations for string dtypes (:issue:`56997`)
- Performance improvement in unary methods on a :class:`RangeIndex` returning a :class:`RangeIndex` instead of a :class:`Index` when possible. (:issue:`57825`)
@@ -386,7 +391,7 @@ Interval

Indexing
^^^^^^^^
-
- Bug in :meth:`DataFrame.__getitem__` returning modified columns when called with ``slice`` in Python 3.12 (:issue:`57500`)
-

Missing
Expand All @@ -396,7 +401,7 @@ Missing

MultiIndex
^^^^^^^^^^
-
- Bug in :meth:`DataFrame.loc` with ``axis=0`` and :class:`MultiIndex` when setting a value adds extra columns (:issue:`58116`)
-

I/O
Expand All @@ -406,7 +411,6 @@ I/O
- Bug in :meth:`DataFrame.to_string` that raised ``StopIteration`` with nested DataFrames. (:issue:`16098`)
- Bug in :meth:`read_csv` raising ``TypeError`` when ``index_col`` is specified and ``na_values`` is a dict containing the key ``None``. (:issue:`57547`)


Period
^^^^^^
-
Expand All @@ -415,6 +419,7 @@ Period
Plotting
^^^^^^^^
- Bug in :meth:`.DataFrameGroupBy.boxplot` failed when there were multiple groupings (:issue:`14701`)
- Bug in :meth:`DataFrame.plot` that causes a shift to the right when the frequency multiplier is greater than one. (:issue:`57587`)
-

Groupby/resample/rolling
1 change: 1 addition & 0 deletions environment.yml
@@ -89,6 +89,7 @@ dependencies:
- numpydoc
- pydata-sphinx-theme=0.14
- pytest-cython # doctest
- docutils < 0.21 # https://github.com/sphinx-doc/sphinx/issues/12302
- sphinx
- sphinx-design
- sphinx-copybutton
