diff --git a/doc/source/whatsnew/v1.0.0.rst b/doc/source/whatsnew/v1.0.0.rst index 051d64ee87711..8c59ed0dd9388 100755 --- a/doc/source/whatsnew/v1.0.0.rst +++ b/doc/source/whatsnew/v1.0.0.rst @@ -290,13 +290,42 @@ New repr for :class:`~pandas.arrays.IntervalArray` closed='right', dtype='interval[int64]') - *pandas 1.0.0* .. ipython:: python pd.arrays.IntervalArray.from_tuples([(0, 1), (2, 3)]) +Extended verbose info output for :class:`~pandas.DataFrame` +^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ + +- :meth:`DataFrame.info` now shows line numbers for the columns summary (:issue:`17304`) + +*pandas 0.25.x* + +.. code-block:: python + + >>> df = pd.DataFrame({"int_col": [1, 2, 3], + ... "text_col": ["a", "b", "c"], + ... "float_col": [0.0, 0.1, 0.2]}) + >>> df.info(verbose=True) + + RangeIndex: 3 entries, 0 to 2 + Data columns (total 3 columns): + int_col 3 non-null int64 + text_col 3 non-null object + float_col 3 non-null float64 + dtypes: float64(1), int64(1), object(1) + memory usage: 152.0+ bytes + +*pandas 1.0.0* + +.. 
ipython:: python + + df = pd.DataFrame({"int_col": [1, 2, 3], + "text_col": ["a", "b", "c"], + "float_col": [0.0, 0.1, 0.2]}) + df.info(verbose=True) All :class:`SeriesGroupBy` aggregation methods now respect the ``observed`` keyword ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ diff --git a/pandas/core/frame.py b/pandas/core/frame.py index b69199defbcc4..8bc417acaf7f3 100644 --- a/pandas/core/frame.py +++ b/pandas/core/frame.py @@ -2276,9 +2276,11 @@ def info( RangeIndex: 5 entries, 0 to 4 Data columns (total 3 columns): - int_col 5 non-null int64 - text_col 5 non-null object - float_col 5 non-null float64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 int_col 5 non-null int64 + 1 text_col 5 non-null object + 2 float_col 5 non-null float64 dtypes: float64(1), int64(1), object(1) memory usage: 248.0+ bytes @@ -2317,9 +2319,11 @@ def info( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - column_1 1000000 non-null object - column_2 1000000 non-null object - column_3 1000000 non-null object + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 22.9+ MB @@ -2327,9 +2331,11 @@ def info( RangeIndex: 1000000 entries, 0 to 999999 Data columns (total 3 columns): - column_1 1000000 non-null object - column_2 1000000 non-null object - column_3 1000000 non-null object + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 column_1 1000000 non-null object + 1 column_2 1000000 non-null object + 2 column_3 1000000 non-null object dtypes: object(3) memory usage: 188.8 MB """ @@ -2348,6 +2354,7 @@ def info( return cols = self.columns + col_count = len(self.columns) # hack if max_cols is None: @@ -2356,36 +2363,76 @@ def info( max_rows = get_option("display.max_info_rows", len(self) + 1) if null_counts is None: - show_counts = 
(len(self.columns) <= max_cols) and (len(self) < max_rows) + show_counts = (col_count <= max_cols) and (len(self) < max_rows) else: show_counts = null_counts - exceeds_info_cols = len(self.columns) > max_cols + exceeds_info_cols = col_count > max_cols def _verbose_repr(): lines.append(f"Data columns (total {len(self.columns)} columns):") - space = max(len(pprint_thing(k)) for k in self.columns) + 4 + + id_head = " # " + column_head = "Column" + col_space = 2 + + max_col = max(len(pprint_thing(k)) for k in cols) + len_column = len(pprint_thing(column_head)) + space = max(max_col, len_column) + col_space + + max_id = len(pprint_thing(col_count)) + len_id = len(pprint_thing(id_head)) + space_num = max(max_id, len_id) + col_space counts = None - tmpl = "{count}{dtype}" + header = _put_str(id_head, space_num) + _put_str(column_head, space) if show_counts: counts = self.count() if len(cols) != len(counts): # pragma: no cover raise AssertionError( f"Columns must equal counts ({len(cols)} != {len(counts)})" ) - tmpl = "{count} non-null {dtype}" + count_header = "Non-Null Count" + len_count = len(count_header) + non_null = " non-null" + max_count = max(len(pprint_thing(k)) for k in counts) + len(non_null) + space_count = max(len_count, max_count) + col_space + count_temp = "{count}" + non_null + else: + count_header = "" + space_count = len(count_header) + len_count = space_count + count_temp = "{count}" + + dtype_header = "Dtype" + len_dtype = len(dtype_header) + max_dtypes = max(len(pprint_thing(k)) for k in self.dtypes) + space_dtype = max(len_dtype, max_dtypes) + header += _put_str(count_header, space_count) + _put_str( + dtype_header, space_dtype + ) + + lines.append(header) + lines.append( + _put_str("-" * len_id, space_num) + + _put_str("-" * len_column, space) + + _put_str("-" * len_count, space_count) + + _put_str("-" * len_dtype, space_dtype) + ) - dtypes = self.dtypes for i, col in enumerate(self.columns): - dtype = dtypes.iloc[i] + dtype = self.dtypes.iloc[i] 
col = pprint_thing(col) + line_no = _put_str(" {num}".format(num=i), space_num) count = "" if show_counts: count = counts.iloc[i] lines.append( - _put_str(col, space) + tmpl.format(count=count, dtype=dtype) + line_no + + _put_str(col, space) + + _put_str(count_temp.format(count=count), space_count) + + _put_str(dtype, space_dtype) ) def _non_verbose_repr(): diff --git a/pandas/tests/frame/test_repr_info.py b/pandas/tests/frame/test_repr_info.py index 60dce36312145..91610102cf0f9 100644 --- a/pandas/tests/frame/test_repr_info.py +++ b/pandas/tests/frame/test_repr_info.py @@ -205,6 +205,28 @@ def test_info(self, float_frame, datetime_frame): frame.info() frame.info(verbose=False) + def test_info_verbose(self): + buf = StringIO() + size = 1001 + start = 5 + frame = DataFrame(np.random.randn(3, size)) + frame.info(verbose=True, buf=buf) + + res = buf.getvalue() + header = " # Column Dtype \n--- ------ ----- " + assert header in res + + frame.info(verbose=True, buf=buf) + buf.seek(0) + lines = buf.readlines() + assert len(lines) > 0 + + for i, line in enumerate(lines): + if i >= start and i < start + size: + index = i - start + line_nr = " {} ".format(index) + assert line.startswith(line_nr) + def test_info_memory(self): # https://github.com/pandas-dev/pandas/issues/21056 df = pd.DataFrame({"a": pd.Series([1, 2], dtype="i8")}) @@ -218,7 +240,9 @@ def test_info_memory(self): RangeIndex: 2 entries, 0 to 1 Data columns (total 1 columns): - a 2 non-null int64 + # Column Non-Null Count Dtype + --- ------ -------------- ----- + 0 a 2 non-null int64 dtypes: int64(1) memory usage: {} bytes """.format( @@ -262,8 +286,8 @@ def test_info_duplicate_columns_shows_correct_dtypes(self): frame.info(buf=io) io.seek(0) lines = io.readlines() - assert "a 1 non-null int64\n" == lines[3] - assert "a 1 non-null float64\n" == lines[4] + assert " 0 a 1 non-null int64 \n" == lines[5] + assert " 1 a 1 non-null float64\n" == lines[6] def test_info_shows_column_dtypes(self): dtypes = [ @@ -283,13 
+307,20 @@ def test_info_shows_column_dtypes(self): buf = StringIO() df.info(buf=buf) res = buf.getvalue() + header = ( + " # Column Non-Null Count Dtype \n" + "--- ------ -------------- ----- " + ) + assert header in res for i, dtype in enumerate(dtypes): - name = "{i:d} {n:d} non-null {dtype}".format(i=i, n=n, dtype=dtype) + name = " {i:d} {i:d} {n:d} non-null {dtype}".format( + i=i, n=n, dtype=dtype + ) assert name in res def test_info_max_cols(self): df = DataFrame(np.random.randn(10, 5)) - for len_, verbose in [(5, None), (5, False), (10, True)]: + for len_, verbose in [(5, None), (5, False), (12, True)]: # For verbose always ^ setting ^ summarize ^ full output with option_context("max_info_columns", 4): buf = StringIO() @@ -297,16 +328,16 @@ def test_info_max_cols(self): res = buf.getvalue() assert len(res.strip().split("\n")) == len_ - for len_, verbose in [(10, None), (5, False), (10, True)]: + for len_, verbose in [(12, None), (5, False), (12, True)]: - # max_cols no exceeded + # max_cols not exceeded with option_context("max_info_columns", 5): buf = StringIO() df.info(buf=buf, verbose=verbose) res = buf.getvalue() assert len(res.strip().split("\n")) == len_ - for len_, max_cols in [(10, 5), (5, 4)]: + for len_, max_cols in [(12, 5), (5, 4)]: # setting truncates with option_context("max_info_columns", 4): buf = StringIO()