From 15d1d1ef1f418a8751d43276fb66aa30a9b15b10 Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Mon, 4 May 2020 10:34:01 +0530
Subject: [PATCH 01/25] ENH Add nrow parameter for line delimited json for
 read_json #33916

---
 pandas/io/json/_json.py                | 24 +++++++++++++++++++++++-
 pandas/tests/io/json/test_readlines.py | 26 ++++++++++++++++++++++++++
 2 files changed, 49 insertions(+), 1 deletion(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index ac6f9ff372601..3435ca8dddafb 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -363,6 +363,7 @@ def read_json(
     lines=False,
     chunksize=None,
     compression="infer",
+    nrows=None,
 ):
     """
     Convert a JSON string to pandas object.
@@ -493,6 +494,12 @@ def read_json(
         for more information on ``chunksize``.
         This can only be passed if `lines=True`.
         If this is None, the file will be read into memory all at once.
+
+    chunksize : int, optional
+        The number of lines from the line-delimited jsonfile that has to be read.
+        This can only be passed if `lines=True`.
+        If this is None, all the rows will be returned.
+
     compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
         For on-the-fly decompression of on-disk data. If 'infer', then use
         gzip, bz2, zip or xz if path_or_buf is a string ending in
@@ -600,6 +607,7 @@ def read_json(
         lines=lines,
         chunksize=chunksize,
        compression=compression,
+        nrows=nrows,
     )

     if chunksize:
@@ -637,6 +645,7 @@ def __init__(
         lines,
         chunksize,
         compression,
+        nrows
     ):

         self.path_or_buf = filepath_or_buffer
@@ -655,11 +664,16 @@ def __init__(
         self.chunksize = chunksize
         self.nrows_seen = 0
         self.should_close = False
+        self.nrows = nrows

         if self.chunksize is not None:
             self.chunksize = _validate_integer("chunksize", self.chunksize, 1)
             if not self.lines:
                 raise ValueError("chunksize can only be passed if lines=True")
+        if self.nrows is not None:
+            self.chunksize = _validate_integer("nrows", self.nrows, 0)
+            if not self.lines:
+                raise ValueError("nrows can only be passed if lines=True")

         data = self._get_data_from_filepath(filepath_or_buffer)
         self.data = self._preprocess_data(data)
@@ -726,7 +740,10 @@ def read(self):
             obj = concat(self)
         elif self.lines:
             data = ensure_str(self.data)
-            obj = self._get_object_parser(self._combine_lines(data.split("\n")))
+            data = data.split("\n")
+            if self.nrows:
+                data = data[:self.nrows]
+            obj = self._get_object_parser(self._combine_lines(data))
         else:
             obj = self._get_object_parser(self.data)
         self.close()
@@ -773,6 +790,11 @@ def close(self):
             pass

     def __next__(self):
+        if self.nrows:
+            if self.nrows_seen >= self.nrows:
+                self.close()
+                raise StopIteration
+
         lines = list(islice(self.data, self.chunksize))
         if lines:
             lines_json = self._combine_lines(lines)
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index e531457627342..d301d1aa41e1a 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -130,6 +130,7 @@ def test_readjson_chunks_closes(chunksize):
         lines=True,
         chunksize=chunksize,
         compression=None,
+        nrows=None
     )
     reader.read()
     assert (
@@ -179,3 +180,28 @@ def test_readjson_unicode(monkeypatch):
         result = read_json(path)
         expected = pd.DataFrame({"£©µÀÆÖÞßéöÿ": ["АБВГДабвгд가"]})
         tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("nrows", [1, 2])
+def test_readjson_nrows(nrows):
+    # Test reading line-format JSON to Series with nrows param
+    jsonl = '''{"a": 1, "b": 2}
+        {"a": 3, "b": 4}
+        {"a": 5, "b": 6}
+        {"a": 7, "b": 8}'''
+    result = pd.read_json(jsonl, lines=True, nrows=nrows)
+    expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
+    tm.assert_frame_equal(result, expected)
+
+
+@pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
+def test_readjson_nrows_chunks(nrows, chunksize):
+    # Test reading line-format JSON to Series with nrows and chunksize param
+    jsonl = '''{"a": 1, "b": 2}
+        {"a": 3, "b": 4}
+        {"a": 5, "b": 6}
+        {"a": 7, "b": 8}'''
+    reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize)
+    chunked = pd.concat(reader)
+    expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
+    tm.assert_frame_equal(chunked, expected)

From fc4993f6c9295e041bb057d278fd9b3101efea3f Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Mon, 4 May 2020 10:44:02 +0530
Subject: [PATCH 02/25] ENH solve linting via black8 for Add nrow parameter
 for line delimited json for read_json #33916

---
 pandas/io/json/_json.py                |  4 ++--
 pandas/tests/io/json/test_readlines.py | 10 +++++-----
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 3435ca8dddafb..f4940b9fc26ac 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -645,7 +645,7 @@ def __init__(
         lines,
         chunksize,
         compression,
-        nrows
+        nrows,
     ):

         self.path_or_buf = filepath_or_buffer
@@ -742,7 +742,7 @@ def read(self):
             data = ensure_str(self.data)
             data = data.split("\n")
             if self.nrows:
-                data = data[:self.nrows]
+                data = data[: self.nrows]
             obj = self._get_object_parser(self._combine_lines(data))
         else:
             obj = self._get_object_parser(self.data)
diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index d301d1aa41e1a..a86070c1614d2 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -130,7 +130,7 @@ def test_readjson_chunks_closes(chunksize):
         lines=True,
         chunksize=chunksize,
         compression=None,
-        nrows=None
+        nrows=None,
     )
     reader.read()
     assert (
@@ -185,10 +185,10 @@ def test_readjson_unicode(monkeypatch):
 @pytest.mark.parametrize("nrows", [1, 2])
 def test_readjson_nrows(nrows):
     # Test reading line-format JSON to Series with nrows param
-    jsonl = '''{"a": 1, "b": 2}
+    jsonl = """{"a": 1, "b": 2}
         {"a": 3, "b": 4}
         {"a": 5, "b": 6}
-        {"a": 7, "b": 8}'''
+        {"a": 7, "b": 8}"""
     result = pd.read_json(jsonl, lines=True, nrows=nrows)
     expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
     tm.assert_frame_equal(result, expected)
@@ -197,10 +197,10 @@ def test_readjson_nrows(nrows):
 @pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
 def test_readjson_nrows_chunks(nrows, chunksize):
     # Test reading line-format JSON to Series with nrows and chunksize param
-    jsonl = '''{"a": 1, "b": 2}
+    jsonl = """{"a": 1, "b": 2}
         {"a": 3, "b": 4}
         {"a": 5, "b": 6}
-        {"a": 7, "b": 8}'''
+        {"a": 7, "b": 8}"""
     reader = read_json(jsonl, lines=True, nrows=nrows, chunksize=chunksize)
     chunked = pd.concat(reader)
     expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
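[A minimal usage sketch of what the two patches above introduce — the data here is hypothetical and this assumes a pandas build with the series applied:

    import pandas as pd

    # Line-delimited JSON: one record per line.
    jsonl = """{"a": 1, "b": 2}
    {"a": 3, "b": 4}
    {"a": 5, "b": 6}
    {"a": 7, "b": 8}"""

    # nrows caps how many lines are parsed; it is only valid with lines=True.
    df = pd.read_json(jsonl, lines=True, nrows=2)
    print(df.shape)  # (2, 2)
]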
From 028d3988be67e9ded6239c8d5b09eb3c8551f955 Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Thu, 7 May 2020 10:21:26 +0530
Subject: [PATCH 03/25] optimized list indexing and type hints added

---
 pandas/io/json/_json.py | 68 ++++++++++++++++++++++++-----------------
 1 file changed, 40 insertions(+), 28 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index f4940b9fc26ac..7f04cd9cbb5c4 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -3,7 +3,7 @@
 from io import StringIO
 from itertools import islice
 import os
-from typing import Any, Callable, Optional, Type
+from typing import Any, Callable, Optional, Type, Iterator

 import numpy as np

@@ -350,20 +350,20 @@ def _write(
 )
 def read_json(
     path_or_buf=None,
-    orient=None,
-    typ="frame",
+    orient: str = None,
+    typ: str = "frame",
     dtype=None,
-    convert_axes=None,
+    convert_axes: bool = None,
     convert_dates=True,
-    keep_default_dates=True,
-    numpy=False,
-    precise_float=False,
-    date_unit=None,
-    encoding=None,
-    lines=False,
-    chunksize=None,
-    compression="infer",
-    nrows=None,
+    keep_default_dates: bool = True,
+    numpy: bool = False,
+    precise_float: bool = False,
+    date_unit: str = None,
+    encoding: str = None,
+    lines: bool = False,
+    chunksize: Optional[int] = None,
+    compression: str = "infer",
+    nrows: int = None,
 ):
     """
     Convert a JSON string to pandas object.
@@ -495,10 +495,12 @@ def read_json(
         for more information on ``chunksize``.
         This can only be passed if `lines=True`.
         If this is None, the file will be read into memory all at once.

-    chunksize : int, optional
+    nrows : int, default None
         The number of lines from the line-delimited jsonfile that has to be read.
         This can only be passed if `lines=True`.
         If this is None, all the rows will be returned.
+        .. versionadded:: 1.1
+

     compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
@@ -632,20 +634,20 @@ class JsonReader(abc.Iterator):
     def __init__(
         self,
         filepath_or_buffer,
-        orient,
-        typ,
+        orient: str,
+        typ: str,
         dtype,
-        convert_axes,
+        convert_axes: bool,
         convert_dates,
-        keep_default_dates,
-        numpy,
-        precise_float,
-        date_unit,
-        encoding,
-        lines,
-        chunksize,
-        compression,
-        nrows,
+        keep_default_dates: bool,
+        numpy: bool,
+        precise_float: bool,
+        date_unit: str,
+        encoding: str,
+        lines: bool,
+        chunksize: Optional[int],
+        compression: str,
+        nrows: int,
     ):

         self.path_or_buf = filepath_or_buffer
@@ -732,6 +734,15 @@ def _combine_lines(self, lines) -> str:
         lines = filter(None, map(lambda x: x.strip(), lines))
         return "[" + ",".join(lines) + "]"

+    def _jsonstring_to_list_generaor(self, data: str) -> Iterator[str]:
+        prev_index = -1
+        while True:
+            next_index = data.find("\n", prev_index + 1)
+            if next_index < 0:
+                break
+            yield data[prev_index + 1 : next_index]
+            prev_index = next_index
+
     def read(self):
         """
         Read the whole JSON input into a pandas object.
@@ -740,9 +751,10 @@ def read(self):
             obj = concat(self)
         elif self.lines:
             data = ensure_str(self.data)
-            data = data.split("\n")
             if self.nrows:
-                data = data[: self.nrows]
+                data = list(islice(self._jsonstring_to_list_generaor(data), self.nrows))
+            else:
+                data = data.split("\n")
             obj = self._get_object_parser(self._combine_lines(data))
         else:
             obj = self._get_object_parser(self.data)
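[The generator added in the patch above scans for newlines lazily, so that `islice` can stop after `nrows` lines instead of splitting the entire string. The same idea in isolation — a sketch only; `iter_lines` is a hypothetical stand-in for the method, and it shares the method's quirk of dropping a final line that lacks a trailing newline:

    from itertools import islice
    from typing import Iterator

    def iter_lines(data: str) -> Iterator[str]:
        # Find each "\n" incrementally rather than materializing
        # every line up front with data.split("\n").
        prev_index = -1
        while True:
            next_index = data.find("\n", prev_index + 1)
            if next_index < 0:
                break
            yield data[prev_index + 1 : next_index]
            prev_index = next_index

    data = '{"a": 1}\n{"a": 2}\n{"a": 3}\n{"a": 4}\n'
    print(list(islice(iter_lines(data), 2)))  # ['{"a": 1}', '{"a": 2}']
]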
From 74e9c2b9a7dfa19359e64562b9b53f78fdbf3a06 Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Thu, 7 May 2020 14:33:43 +0530
Subject: [PATCH 04/25] solved errors related to typing of args and linting
 issues

---
 pandas/io/json/_json.py | 43 ++++++++++++++++++++---------------------
 1 file changed, 21 insertions(+), 22 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 7f04cd9cbb5c4..c80136231761c 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -3,7 +3,7 @@
 from io import StringIO
 from itertools import islice
 import os
-from typing import Any, Callable, Optional, Type, Iterator
+from typing import Any, Callable, Iterator, Optional, Type

 import numpy as np

@@ -350,20 +350,20 @@ def _write(
 )
 def read_json(
     path_or_buf=None,
-    orient: str = None,
-    typ: str = "frame",
+    orient=None,
+    typ="frame",
     dtype=None,
-    convert_axes: bool = None,
+    convert_axes=None,
     convert_dates=True,
     keep_default_dates: bool = True,
     numpy: bool = False,
     precise_float: bool = False,
-    date_unit: str = None,
-    encoding: str = None,
+    date_unit=None,
+    encoding=None,
     lines: bool = False,
     chunksize: Optional[int] = None,
-    compression: str = "infer",
-    nrows: int = None,
+    compression="infer",
+    nrows: Optional[int] = None,
 ):
     """
     Convert a JSON string to pandas object.
@@ -495,13 +495,6 @@ def read_json(
         This can only be passed if `lines=True`.
         If this is None, the file will be read into memory all at once.

-    nrows : int, default None
-        The number of lines from the line-delimited jsonfile that has to be read.
-        This can only be passed if `lines=True`.
-        If this is None, all the rows will be returned.
-        .. versionadded:: 1.1
-
-
     compression : {'infer', 'gzip', 'bz2', 'zip', 'xz', None}, default 'infer'
         For on-the-fly decompression of on-disk data. If 'infer', then use
         gzip, bz2, zip or xz if path_or_buf is a string ending in
@@ -509,6 +502,12 @@ def read_json(
         otherwise. If using 'zip', the ZIP file must contain only one data
         file to be read in. Set to None for no decompression.

+    nrows : int, optional
+        The number of lines from the line-delimited jsonfile that has to be read.
+        This can only be passed if `lines=True`.
+        If this is None, all the rows will be returned.
+        .. versionadded:: 1.1
+
     Returns
     -------
     Series or DataFrame
@@ -633,20 +633,20 @@ class JsonReader(abc.Iterator):
     def __init__(
         self,
         filepath_or_buffer,
-        orient: str,
-        typ: str,
+        orient,
+        typ,
         dtype,
-        convert_axes: bool,
+        convert_axes,
         convert_dates,
         keep_default_dates: bool,
         numpy: bool,
         precise_float: bool,
-        date_unit: str,
-        encoding: str,
+        date_unit,
+        encoding,
         lines: bool,
         chunksize: Optional[int],
-        compression: str,
-        nrows: int,
+        compression,
+        nrows: Optional[int],
     ):

         self.path_or_buf = filepath_or_buffer

From ca9c3e08cdd2b85a59e1c27b66d98a3278f50b1f Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Sat, 9 May 2020 22:18:31 +0530
Subject: [PATCH 05/25] use an iterator to slice strings

---
 pandas/io/json/_json.py | 16 +++++-----------
 1 file changed, 5 insertions(+), 11 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index c80136231761c..c36bbfdb62bc4 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -3,7 +3,8 @@
 from io import StringIO
 from itertools import islice
 import os
-from typing import Any, Callable, Iterator, Optional, Type
+import re
+from typing import Any, Callable, Optional, Type

 import numpy as np

@@ -733,15 +734,6 @@ def _combine_lines(self, lines) -> str:
         lines = filter(None, map(lambda x: x.strip(), lines))
         return "[" + ",".join(lines) + "]"

-    def _jsonstring_to_list_generaor(self, data: str) -> Iterator[str]:
-        prev_index = -1
-        while True:
-            next_index = data.find("\n", prev_index + 1)
-            if next_index < 0:
-                break
-            yield data[prev_index + 1 : next_index]
-            prev_index = next_index
-
     def read(self):
         """
         Read the whole JSON input into a pandas object.
@@ -751,9 +743,9 @@ def read(self):
         elif self.lines:
             data = ensure_str(self.data)
             if self.nrows:
-                data = list(islice(self._jsonstring_to_list_generaor(data), self.nrows))
+                compiled_pattern = re.compile("\n")
+                data_iterator = compiled_pattern.finditer("data")
+                data = list(islice(data_iterator, self.nrows))
             else:
                 data = data.split("\n")
             obj = self._get_object_parser(self._combine_lines(data))

From b355f9ca9e6693923242393d09f8775a481ae002 Mon Sep 17 00:00:00 2001
From: Mohammad Hasnain Mohsin Rajan
Date: Tue, 19 May 2020 09:36:21 +0530
Subject: [PATCH 06/25] Update pandas/io/json/_json.py

fixed typo

Co-authored-by: William Ayd
---
 pandas/io/json/_json.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index c36bbfdb62bc4..401cc48649f9f 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -673,7 +673,7 @@ def __init__(
             if not self.lines:
                 raise ValueError("chunksize can only be passed if lines=True")
         if self.nrows is not None:
-            self.chunksize = _validate_integer("nrows", self.nrows, 0)
+            self.nrows = _validate_integer("nrows", self.nrows, 0)
             if not self.lines:
                 raise ValueError("nrows can only be passed if lines=True")

From 74e9c2b9a7dfa19359e64562b9b53f78fdbf3a06 Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Tue, 19 May 2020 21:44:58 +0530
Subject: [PATCH 07/25] fixed errors with nrows iterators

---
 pandas/io/json/_json.py | 16 ++++++++++++++--
 1 file changed, 14 insertions(+), 2 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 401cc48649f9f..ee744eaf45482 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -742,10 +742,22 @@ def read(self):
             obj = concat(self)
         elif self.lines:
             data = ensure_str(self.data)
+            print(data)
             if self.nrows:
                 compiled_pattern = re.compile("\n")
-                data_iterator = compiled_pattern.finditer("data")
-                data = list(islice(data_iterator, self.nrows))
+                data_iterator = compiled_pattern.finditer(data)
+                data_surrogate = []
+                start = 0
+                nrows_seen = 0
+                for vals in data_iterator:
+                    if nrows_seen >= self.nrows:
+                        break
+                    begin, end = vals.span()
+                    data_surrogate.append(data[start:begin].strip())
+                    start = end
+                    nrows_seen += 1
+                data = data_surrogate
+                print(data)
             else:
                 data = data.split("\n")
             obj = self._get_object_parser(self._combine_lines(data))

From 237010ef864f0c9fd35e6470ae08864523ec31e0 Mon Sep 17 00:00:00 2001
From: Mohammad Hasnain Mohsin Rajan
Date: Wed, 20 May 2020 20:36:27 +0530
Subject: [PATCH 08/25] remove print statements

---
 pandas/io/json/_json.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index ee744eaf45482..31676ef25556b 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -742,7 +742,6 @@ def read(self):
             obj = concat(self)
         elif self.lines:
             data = ensure_str(self.data)
-            print(data)
             if self.nrows:
                 compiled_pattern = re.compile("\n")
                 data_iterator = compiled_pattern.finditer(data)
@@ -757,7 +756,6 @@ def read(self):
                     start = end
                     nrows_seen += 1
                 data = data_surrogate
-                print(data)
             else:
                 data = data.split("\n")
             obj = self._get_object_parser(self._combine_lines(data))

From b0b0d69c147a310216e0ab198b0c4c631acfe31f Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Fri, 22 May 2020 21:29:18 +0530
Subject: [PATCH 09/25] refactor nrows for json files to use a simpler regular
 expression

---
 pandas/io/json/_json.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 31676ef25556b..1536cb2b7d0b7 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -743,19 +743,16 @@ def read(self):
         elif self.lines:
             data = ensure_str(self.data)
             if self.nrows:
-                compiled_pattern = re.compile("\n")
+                compiled_pattern = re.compile(".*\n")
                 data_iterator = compiled_pattern.finditer(data)
-                data_surrogate = []
-                start = 0
+                data = []
                 nrows_seen = 0
+                print(data_iterator)
                 for vals in data_iterator:
                     if nrows_seen >= self.nrows:
                         break
-                    begin, end = vals.span()
-                    data_surrogate.append(data[start:begin].strip())
-                    start = end
+                    data.append(vals.group(0))
                     nrows_seen += 1
-                data = data_surrogate
             else:
                 data = data.split("\n")

From 0a6717ab881f1fef1915bcac66c41d97ba8146c2 Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Sat, 23 May 2020 05:19:58 +0530
Subject: [PATCH 10/25] remove debug lines

---
 pandas/io/json/_json.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 1536cb2b7d0b7..734bb1bfb9070 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -747,7 +747,6 @@ def read(self):
                 data_iterator = compiled_pattern.finditer(data)
                 data = []
                 nrows_seen = 0
-                print(data_iterator)
                 for vals in data_iterator:
                     if nrows_seen >= self.nrows:
                         break

From db50e925ecb82ad98dec4651e9a117e59046b5b0 Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Mon, 25 May 2020 00:39:56 +0530
Subject: [PATCH 11/25] add test check if ValueError is raised if nrows is set
 and lines in not in pd.read_json()

---
 pandas/tests/io/json/test_readlines.py | 10 ++++++++++
 1 file changed, 10 insertions(+)

diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index a86070c1614d2..15c3b1891440d 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -205,3 +205,13 @@ def test_readjson_nrows_chunks(nrows, chunksize):
     chunked = pd.concat(reader)
     expected = pd.DataFrame({"a": [1, 3, 5, 7], "b": [2, 4, 6, 8]}).iloc[:nrows]
     tm.assert_frame_equal(chunked, expected)
+
+
+def test_readjson_nrows_requires_lines():
+    jsonl = """{"a": 1, "b": 2}
+        {"a": 3, "b": 4}
+        {"a": 5, "b": 6}
+        {"a": 7, "b": 8}"""
+    msg = "nrows can only be passed if lines=True"
+    with pytest.raises(ValueError, match=msg):
+        pd.read_json(jsonl, lines=False, nrows=2)

From 3b139b37805632b5566ac2e9900045e7ed5a3d93 Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Mon, 25 May 2020 10:23:09 +0530
Subject: [PATCH 12/25] refactor to use generators better

---
 pandas/io/json/_json.py | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index 734bb1bfb9070..c0450c231803c 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -744,14 +744,8 @@ def read(self):
             data = ensure_str(self.data)
             if self.nrows:
                 compiled_pattern = re.compile(".*\n")
-                data_iterator = compiled_pattern.finditer(data)
-                data = []
-                nrows_seen = 0
-                for vals in data_iterator:
-                    if nrows_seen >= self.nrows:
-                        break
-                    data.append(vals.group(0))
-                    nrows_seen += 1
+                data = (line.group(0) for line in compiled_pattern.finditer(data))
+                data = islice(data, self.nrows)
             else:
                 data = data.split("\n")
             obj = self._get_object_parser(self._combine_lines(data))

From 730d6d8acad15fc44ca5782c20cf910428cec7fe Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Mon, 25 May 2020 12:26:17 +0530
Subject: [PATCH 13/25] Add related issue number and comments for tests

---
 pandas/tests/io/json/test_readlines.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/pandas/tests/io/json/test_readlines.py b/pandas/tests/io/json/test_readlines.py
index 15c3b1891440d..53462eaaada8d 100644
--- a/pandas/tests/io/json/test_readlines.py
+++ b/pandas/tests/io/json/test_readlines.py
@@ -184,6 +184,7 @@ def test_readjson_unicode(monkeypatch):

 @pytest.mark.parametrize("nrows", [1, 2])
 def test_readjson_nrows(nrows):
+    # GH 33916
     # Test reading line-format JSON to Series with nrows param
     jsonl = """{"a": 1, "b": 2}
@@ -196,6 +197,7 @@ def test_readjson_nrows(nrows):

 @pytest.mark.parametrize("nrows,chunksize", [(2, 2), (4, 2)])
 def test_readjson_nrows_chunks(nrows, chunksize):
+    # GH 33916
     # Test reading line-format JSON to Series with nrows and chunksize param
     jsonl = """{"a": 1, "b": 2}
@@ -208,6 +210,8 @@ def test_readjson_nrows_chunks(nrows, chunksize):


 def test_readjson_nrows_requires_lines():
+    # GH 33916
+    # Test ValuError raised if nrows is set without setting lines in read_json
     jsonl = """{"a": 1, "b": 2}
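[The test added in patch 11 and annotated above pins down the error path; as a quick sketch of the user-facing behaviour (hypothetical data, same assumptions as before):

    import pandas as pd

    # JsonReader.__init__ validates nrows before any data is read,
    # so passing it without lines=True fails fast.
    try:
        pd.read_json('{"a": 1}\n{"a": 2}', nrows=1)
    except ValueError as err:
        print(err)  # "nrows can only be passed if lines=True"
]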
From b6a94990dd6b3efe79d85cdcbc7da6b72d1145a0 Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Thu, 28 May 2020 12:38:03 +0530
Subject: [PATCH 14/25] use StringIO iterator for nrows as used in chunks

---
 pandas/io/json/_json.py | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/pandas/io/json/_json.py b/pandas/io/json/_json.py
index c0450c231803c..1d751349b79e9 100644
--- a/pandas/io/json/_json.py
+++ b/pandas/io/json/_json.py
@@ -3,7 +3,6 @@
 from io import StringIO
 from itertools import islice
 import os
-import re
 from typing import Any, Callable, Optional, Type

 import numpy as np

@@ -507,6 +506,7 @@ def read_json(
         The number of lines from the line-delimited jsonfile that has to be read.
         This can only be passed if `lines=True`.
         If this is None, all the rows will be returned.
+
         .. versionadded:: 1.1

     Returns
@@ -688,9 +688,9 @@ def _preprocess_data(self, data):
         If self.chunksize, we prepare the data for the `__next__` method.
         Otherwise, we read it into memory for the `read` method.
         """
-        if hasattr(data, "read") and not self.chunksize:
+        if hasattr(data, "read") and (not self.chunksize or not self.nrows):
             data = data.read()
-        if not hasattr(data, "read") and self.chunksize:
+        if not hasattr(data, "read") and (self.chunksize or self.nrows):
             data = StringIO(data)

         return data
@@ -738,17 +738,17 @@ def read(self):
         """
         Read the whole JSON input into a pandas object.
         """
-        if self.lines and self.chunksize:
-            obj = concat(self)
-        elif self.lines:
-            data = ensure_str(self.data)
-            if self.nrows:
-                compiled_pattern = re.compile(".*\n")
-                data = (line.group(0) for line in compiled_pattern.finditer(data))
-                data = islice(data, self.nrows)
+        if self.lines:
+            if self.chunksize:
+                obj = concat(self)
+            elif self.nrows:
+                lines = list(islice(self.data, self.nrows))
+                lines_json = self._combine_lines(lines)
+                obj = self._get_object_parser(lines_json)
             else:
+                data = ensure_str(self.data)
                 data = data.split("\n")
-            obj = self._get_object_parser(self._combine_lines(data))
+                obj = self._get_object_parser(self._combine_lines(data))
         else:
             obj = self._get_object_parser(self.data)
         self.close()

From 8c830b4984c17e0e36a454af8e25d4fb379c285f Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Sat, 30 May 2020 03:02:03 +0530
Subject: [PATCH 15/25] add asv benchmarks for nrows in read_json

---
 asv_bench/benchmarks/io/json.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
index f478bf2aee0ba..151613036c54d 100644
--- a/asv_bench/benchmarks/io/json.py
+++ b/asv_bench/benchmarks/io/json.py
@@ -53,12 +53,24 @@ def time_read_json_lines(self, index):
     def time_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))

+    def time_read_json_lines_nrows(self, index):
+        read_json(self.fname, orient="records", lines=True, nrows=15000)
+
+    def time_read_json_lines_nrows_larger(self, index):
+        read_json(self.fname, orient="records", lines=True, nrows=45000)
+
     def peakmem_read_json_lines(self, index):
         read_json(self.fname, orient="records", lines=True)

     def peakmem_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))

+    def peakmem_read_json_lines_nrows(self, index):
+        read_json(self.fname, orient="records", lines=True, nrows=15000)
+
+    def peakmem_read_json_lines_nrows_larger(self, index):
+        read_json(self.fname, orient="records", lines=True, nrows=45000)
+

 class ToJSON(BaseIO):

From 5c55339f253d4909651a756fe1ebdd454865a30e Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Sat, 30 May 2020 14:10:43 +0530
Subject: [PATCH 16/25] add benchmarks to read a single chunk

---
 asv_bench/benchmarks/io/json.py | 36 +++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
index 151613036c54d..731a08c9d0806 100644
--- a/asv_bench/benchmarks/io/json.py
+++ b/asv_bench/benchmarks/io/json.py
@@ -53,6 +53,24 @@ def time_read_json_lines(self, index):
     def time_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))

+    def time_read_json_lines_read_one_chunk(self, index):
+        iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
+        for i, j in enumerate(iterator):
+            if i == 0:
+                break
+
+    def time_read_json_lines_read_two_chunk(self, index):
+        iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
+        for i, j in enumerate(iterator):
+            if i == 1:
+                break
+
+    def time_read_json_lines_read_three_chunk(self, index):
+        iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
+        for i, j in enumerate(iterator):
+            if i == 2:
+                break
+
     def time_read_json_lines_nrows(self, index):
         read_json(self.fname, orient="records", lines=True, nrows=15000)

@@ -65,6 +83,24 @@ def peakmem_read_json_lines(self, index):
     def peakmem_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))

+    def peakmem_read_json_lines_one_chunk(self, index):
+        iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
+        for i, j in enumerate(iterator):
+            if i == 0:
+                break
+
+    def peakmem_read_json_lines_two_chunk(self, index):
+        iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
+        for i, j in enumerate(iterator):
+            if i == 1:
+                break
+
+    def peakmem_read_json_lines_three_chunk(self, index):
+        iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
+        for i, j in enumerate(iterator):
+            if i == 2:
+                break
+
     def peakmem_read_json_lines_nrows(self, index):
         read_json(self.fname, orient="records", lines=True, nrows=15000)

From d19309ad7d5a661321cb7a0bddf101faf22ad2de Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Sat, 30 May 2020 14:30:06 +0530
Subject: [PATCH 17/25] chunksize 1,100,10000 benchmarks

---
 asv_bench/benchmarks/io/json.py | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)

diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
index 731a08c9d0806..314ed9a9616a7 100644
--- a/asv_bench/benchmarks/io/json.py
+++ b/asv_bench/benchmarks/io/json.py
@@ -53,6 +53,15 @@ def time_read_json_lines(self, index):
     def time_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))

+    def time_read_json_lines_concat_one(self, index):
+        concat(read_json(self.fname, orient="records", lines=True, chunksize=1))
+
+    def time_read_json_lines_concat_hundred(self, index):
+        concat(read_json(self.fname, orient="records", lines=True, chunksize=100))
+
+    def time_read_json_lines_concat_ten_thousand(self, index):
+        concat(read_json(self.fname, orient="records", lines=True, chunksize=10000))
+
     def time_read_json_lines_read_one_chunk(self, index):
         iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
         for i, j in enumerate(iterator):
@@ -83,6 +92,15 @@ def peakmem_read_json_lines(self, index):
     def peakmem_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))

+    def peakmem_read_json_lines_concat_one(self, index):
+        concat(read_json(self.fname, orient="records", lines=True, chunksize=1))
+
+    def peakmem_read_json_lines_concat_hundred(self, index):
+        concat(read_json(self.fname, orient="records", lines=True, chunksize=100))
+
+    def peakmem_read_json_lines_concat_ten_thousand(self, index):
+        concat(read_json(self.fname, orient="records", lines=True, chunksize=10000))
+
     def peakmem_read_json_lines_one_chunk(self, index):
         iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
         for i, j in enumerate(iterator):

From dec797e3afc9e76f08bd8baa4a0ab37b2257de3b Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Sat, 30 May 2020 16:21:57 +0530
Subject: [PATCH 18/25] remove wrong benchmarks

---
 asv_bench/benchmarks/io/json.py | 36 ---------------------------------
 1 file changed, 36 deletions(-)

diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
index 314ed9a9616a7..a186743acf31c 100644
--- a/asv_bench/benchmarks/io/json.py
+++ b/asv_bench/benchmarks/io/json.py
@@ -62,24 +62,6 @@ def time_read_json_lines_concat_hundred(self, index):
     def time_read_json_lines_concat_ten_thousand(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=10000))

-    def time_read_json_lines_read_one_chunk(self, index):
-        iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
-        for i, j in enumerate(iterator):
-            if i == 0:
-                break
-
-    def time_read_json_lines_read_two_chunk(self, index):
-        iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
-        for i, j in enumerate(iterator):
-            if i == 1:
-                break
-
-    def time_read_json_lines_read_three_chunk(self, index):
-        iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
-        for i, j in enumerate(iterator):
-            if i == 2:
-                break
-
     def time_read_json_lines_nrows(self, index):
         read_json(self.fname, orient="records", lines=True, nrows=15000)

@@ -101,24 +83,6 @@ def peakmem_read_json_lines_concat_hundred(self, index):
     def peakmem_read_json_lines_concat_ten_thousand(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=10000))

-    def peakmem_read_json_lines_one_chunk(self, index):
-        iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
-        for i, j in enumerate(iterator):
-            if i == 0:
-                break
-
-    def peakmem_read_json_lines_two_chunk(self, index):
-        iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
-        for i, j in enumerate(iterator):
-            if i == 1:
-                break
-
-    def peakmem_read_json_lines_three_chunk(self, index):
-        iterator = read_json(self.fname, orient="records", lines=True, chunksize=25000)
-        for i, j in enumerate(iterator):
-            if i == 2:
-                break
-
     def peakmem_read_json_lines_nrows(self, index):
         read_json(self.fname, orient="records", lines=True, nrows=15000)

From 91e0b94160fb442a60dc4aadb172a12d4a4062f1 Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Sat, 30 May 2020 16:23:19 +0530
Subject: [PATCH 19/25] remove wrong benchmarks

---
 asv_bench/benchmarks/io/json.py | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
index a186743acf31c..821f5f7b4bce1 100644
--- a/asv_bench/benchmarks/io/json.py
+++ b/asv_bench/benchmarks/io/json.py
@@ -53,9 +53,6 @@ def time_read_json_lines(self, index):
     def time_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))

-    def time_read_json_lines_concat_one(self, index):
-        concat(read_json(self.fname, orient="records", lines=True, chunksize=1))
-
     def time_read_json_lines_concat_hundred(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=100))

@@ -74,9 +71,6 @@ def peakmem_read_json_lines(self, index):
     def peakmem_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))

-    def peakmem_read_json_lines_concat_one(self, index):
-        concat(read_json(self.fname, orient="records", lines=True, chunksize=1))
-
     def peakmem_read_json_lines_concat_hundred(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=100))

From c01039943d3a985a0e4dafbbb4e8e2986f20d36b Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Wed, 3 Jun 2020 16:59:52 +0530
Subject: [PATCH 20/25] add whatsnew and remove unwanted benchmarks

---
 asv_bench/benchmarks/io/json.py | 20 +-------------------
 doc/source/whatsnew/v1.1.0.rst  |  1 +
 2 files changed, 2 insertions(+), 19 deletions(-)

diff --git a/asv_bench/benchmarks/io/json.py b/asv_bench/benchmarks/io/json.py
index 821f5f7b4bce1..a490e250943f5 100644
--- a/asv_bench/benchmarks/io/json.py
+++ b/asv_bench/benchmarks/io/json.py
@@ -53,17 +53,8 @@ def time_read_json_lines(self, index):
     def time_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))

-    def time_read_json_lines_concat_hundred(self, index):
-        concat(read_json(self.fname, orient="records", lines=True, chunksize=100))
-
-    def time_read_json_lines_concat_ten_thousand(self, index):
-        concat(read_json(self.fname, orient="records", lines=True, chunksize=10000))
-
     def time_read_json_lines_nrows(self, index):
-        read_json(self.fname, orient="records", lines=True, nrows=15000)
-
-    def time_read_json_lines_nrows_larger(self, index):
-        read_json(self.fname, orient="records", lines=True, nrows=45000)
+        read_json(self.fname, orient="records", lines=True, nrows=25000)

     def peakmem_read_json_lines(self, index):
         read_json(self.fname, orient="records", lines=True)
@@ -71,18 +62,9 @@ def peakmem_read_json_lines(self, index):
     def peakmem_read_json_lines_concat(self, index):
         concat(read_json(self.fname, orient="records", lines=True, chunksize=25000))

-    def peakmem_read_json_lines_concat_hundred(self, index):
-        concat(read_json(self.fname, orient="records", lines=True, chunksize=100))
-
-    def peakmem_read_json_lines_concat_ten_thousand(self, index):
-        concat(read_json(self.fname, orient="records", lines=True, chunksize=10000))
-
     def peakmem_read_json_lines_nrows(self, index):
         read_json(self.fname, orient="records", lines=True, nrows=15000)

-    def peakmem_read_json_lines_nrows_larger(self, index):
-        read_json(self.fname, orient="records", lines=True, nrows=45000)
-

 class ToJSON(BaseIO):
diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index bfe2dcee40d5e..2c086718197e0 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -236,6 +236,7 @@ Other enhancements
   and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`).
 - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`).
 - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`)
+- :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).

 .. ---------------------------------------------------------------------------

From 2355fc547d5f797d7b44bfca21678c13d9cf71dc Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Wed, 3 Jun 2020 17:07:14 +0530
Subject: [PATCH 21/25] remove conflict

---
 doc/source/whatsnew/v1.1.0.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 2c086718197e0..bfe2dcee40d5e 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -236,7 +236,6 @@ Other enhancements
   and :class:`~pandas.io.stata.StataWriterUTF8` (:issue:`26599`).
 - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`).
 - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`)
-- :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).

 .. ---------------------------------------------------------------------------
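[Patch 14 above settles on reusing the chunked reader's machinery: in-memory text is wrapped in a StringIO so that both chunksize and nrows can consume it line by line. The mechanism reduced to plain Python — a sketch under those assumptions, not pandas internals verbatim:

    from io import StringIO
    from itertools import islice

    jsonl = '{"a": 1}\n{"a": 2}\n{"a": 3}\n{"a": 4}\n'

    buf = StringIO(jsonl)             # file-like view over the string
    first_two = list(islice(buf, 2))  # pulls exactly two lines off the buffer
    print(first_two)                  # ['{"a": 1}\n', '{"a": 2}\n']

    # _combine_lines then builds one JSON array string for the parser:
    print("[" + ",".join(line.strip() for line in first_two) + "]")
    # [{"a": 1},{"a": 2}]
]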
From 7fcf3dbd477bb769283db817a9a5c4e660f5adc5 Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Wed, 3 Jun 2020 17:10:29 +0530
Subject: [PATCH 22/25] add whatsnew for nrows

---
 doc/source/whatsnew/v1.1.0.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index b4e29291bb12d..5ec23d2914df3 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -288,7 +288,7 @@ Other enhancements
 - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`).
 - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`)
 - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`).
-
+- :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).
 .. ---------------------------------------------------------------------------

 Increased minimum versions for dependencies

From 9e667a1e9cf767efd191a1804179c33b36f06bf5 Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Wed, 3 Jun 2020 17:52:50 +0530
Subject: [PATCH 23/25] solve doc error

---
 doc/source/whatsnew/v1.1.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 5ec23d2914df3..75437e76f5357 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -289,6 +289,7 @@ Other enhancements
 - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`)
 - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`).
 - :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).
+
 .. ---------------------------------------------------------------------------

 Increased minimum versions for dependencies

From cb3de4d6a137e610019e57a9bc69483fcc73334d Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Thu, 4 Jun 2020 06:27:59 +0530
Subject: [PATCH 24/25] remove merge conflict lines

---
 doc/source/whatsnew/v1.1.0.rst | 1 -
 1 file changed, 1 deletion(-)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 75437e76f5357..b4e29291bb12d 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -288,7 +288,6 @@ Other enhancements
 - :meth:`HDFStore.put` now accepts `track_times` parameter. Parameter is passed to ``create_table`` method of ``PyTables`` (:issue:`32682`).
 - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`)
 - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`).
-- :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).

 .. ---------------------------------------------------------------------------

From 2ce74db8972c1014b05d1d186e6833a27227e71f Mon Sep 17 00:00:00 2001
From: hasnain2808
Date: Thu, 4 Jun 2020 06:30:04 +0530
Subject: [PATCH 25/25] added the conflicting line back

---
 doc/source/whatsnew/v1.1.0.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/doc/source/whatsnew/v1.1.0.rst b/doc/source/whatsnew/v1.1.0.rst
index 7834e1a5c4898..0f5c887aa3137 100644
--- a/doc/source/whatsnew/v1.1.0.rst
+++ b/doc/source/whatsnew/v1.1.0.rst
@@ -289,6 +289,7 @@ Other enhancements
 - Make :class:`pandas.core.window.Rolling` and :class:`pandas.core.window.Expanding` iterable(:issue:`11704`)
 - Make ``option_context`` a :class:`contextlib.ContextDecorator`, which allows it to be used as a decorator over an entire function (:issue:`34253`).
 - :meth:`groupby.transform` now allows ``func`` to be ``pad``, ``backfill`` and ``cumcount`` (:issue:`31269`).
+- :meth:`~pandas.io.json.read_json` now accepts `nrows` parameter. (:issue:`33916`).

 .. ----------------------------------------------------------------------------
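[Taken together, the user-facing behaviour the series ends with — an illustrative sketch assuming pandas 1.1 with these patches applied; the data is hypothetical:

    import pandas as pd

    jsonl = """{"a": 1, "b": 2}
    {"a": 3, "b": 4}
    {"a": 5, "b": 6}
    {"a": 7, "b": 8}"""

    # Cap the number of lines read:
    print(pd.read_json(jsonl, lines=True, nrows=2))

    # Combined with chunksize, iteration stops once nrows_seen reaches nrows;
    # with chunk-aligned values, as in the tests, exactly nrows rows come back.
    reader = pd.read_json(jsonl, lines=True, chunksize=2, nrows=4)
    print(pd.concat(reader))
]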