From 01b69305b784e62da43df200eb1f63ea14548619 Mon Sep 17 00:00:00 2001
From: Wes McKinney
Date: Wed, 20 Jan 2016 22:33:11 -0800
Subject: [PATCH 1/2] CLN: fix all flake8 warnings in pandas/io

---
 pandas/io/api.py | 2 +
 pandas/io/clipboard.py | 2 +-
 pandas/io/common.py | 30 +-
 pandas/io/data.py | 2 +
 pandas/io/excel.py | 147 +-
 pandas/io/ga.py | 2 +
 pandas/io/gbq.py | 214 +-
 pandas/io/html.py | 16 +-
 pandas/io/json.py | 29 +-
 pandas/io/packers.py | 55 +-
 pandas/io/parsers.py | 152 +-
 pandas/io/pickle.py | 4 +-
 pandas/io/pytables.py | 193 +-
 pandas/io/sas.py | 87 +-
 pandas/io/sql.py | 100 +-
 pandas/io/stata.py | 125 +-
 .../io/tests/generate_legacy_storage_files.py | 81 +-
 pandas/io/tests/test_clipboard.py | 16 +-
 pandas/io/tests/test_common.py | 2 +-
 pandas/io/tests/test_cparser.py | 40 +-
 pandas/io/tests/test_data.py | 2 +
 pandas/io/tests/test_date_converters.py | 32 +-
 pandas/io/tests/test_excel.py | 266 +--
 pandas/io/tests/test_ga.py | 2 +
 pandas/io/tests/test_gbq.py | 211 +-
 pandas/io/tests/test_html.py | 20 +-
 pandas/io/tests/test_json/test_pandas.py | 138 +-
 pandas/io/tests/test_json/test_ujson.py | 304 +--
 pandas/io/tests/test_json_norm.py | 75 +-
 pandas/io/tests/test_packers.py | 153 +-
 pandas/io/tests/test_parsers.py | 691 ++++---
 pandas/io/tests/test_pickle.py | 54 +-
 pandas/io/tests/test_pytables.py | 1806 ++++++++++-------
 pandas/io/tests/test_sas.py | 15 +-
 pandas/io/tests/test_sql.py | 415 ++--
 pandas/io/tests/test_stata.py | 232 ++-
 pandas/io/tests/test_wb.py | 2 +
 pandas/io/wb.py | 2 +
 38 files changed, 3271 insertions(+), 2448 deletions(-)

diff --git a/pandas/io/api.py b/pandas/io/api.py
index fedde462c74b7..3ac4c670c8466 100644
--- a/pandas/io/api.py
+++ b/pandas/io/api.py
@@ -2,6 +2,8 @@
 Data IO api
 """
 
+# flake8: noqa
+
 from pandas.io.parsers import read_csv, read_table, read_fwf
 from pandas.io.clipboard import read_clipboard
 from pandas.io.excel import ExcelFile, ExcelWriter, read_excel
diff --git a/pandas/io/clipboard.py b/pandas/io/clipboard.py
index dfa46156aaead..2109e1c5d6d4c 100644
--- a/pandas/io/clipboard.py
+++ b/pandas/io/clipboard.py
@@ -42,7 +42,7 @@ def read_clipboard(**kwargs):  # pragma: no cover
     # 1 3 4
 
     counts = set([x.lstrip().count('\t') for x in lines])
-    if len(lines)>1 and len(counts) == 1 and counts.pop() != 0:
+    if len(lines) > 1 and len(counts) == 1 and counts.pop() != 0:
         kwargs['sep'] = '\t'
 
     if kwargs.get('sep') is None and kwargs.get('delim_whitespace') is None:
diff --git a/pandas/io/common.py b/pandas/io/common.py
index e46f609077810..811d42b7b4b9e 100644
--- a/pandas/io/common.py
+++ b/pandas/io/common.py
@@ -30,20 +30,19 @@
     from urllib.request import urlopen, pathname2url
     _urlopen = urlopen
     from urllib.parse import urlparse as parse_url
-    import urllib.parse as compat_parse
     from urllib.parse import (uses_relative, uses_netloc, uses_params,
                               urlencode, urljoin)
     from urllib.error import URLError
-    from http.client import HTTPException
+    from http.client import HTTPException  # noqa
 else:
     from urllib2 import urlopen as _urlopen
-    from urllib import urlencode, pathname2url
+    from urllib import urlencode, pathname2url  # noqa
     from urlparse import urlparse as parse_url
     from urlparse import uses_relative, uses_netloc, uses_params, urljoin
-    from urllib2 import URLError
-    from httplib import HTTPException
-    from contextlib import contextmanager, closing
-    from functools import wraps
+    from urllib2 import URLError  # noqa
+    from httplib import HTTPException  # noqa
+    from contextlib import contextmanager, closing  # noqa
+    from functools import wraps  # noqa
# @wraps(_urlopen) @contextmanager @@ -66,6 +65,7 @@ class DtypeWarning(Warning): try: from boto.s3 import key + class BotoFileLikeReader(key.Key): """boto Key modified to be more file-like @@ -78,10 +78,12 @@ class BotoFileLikeReader(key.Key): Also adds a `readline` function which will split the returned values by the `\n` character. """ + def __init__(self, *args, **kwargs): encoding = kwargs.pop("encoding", None) # Python 2 compat super(BotoFileLikeReader, self).__init__(*args, **kwargs) - self.finished_read = False # Add a flag to mark the end of the read. + # Add a flag to mark the end of the read. + self.finished_read = False self.buffer = "" self.lines = [] if encoding is None and compat.PY3: @@ -121,7 +123,8 @@ def readline(self): raise StopIteration if self.encoding: - self.buffer = "{}{}".format(self.buffer, self.read(8192).decode(self.encoding)) + self.buffer = "{}{}".format( + self.buffer, self.read(8192).decode(self.encoding)) else: self.buffer = "{}{}".format(self.buffer, self.read(8192)) @@ -211,6 +214,7 @@ def _expand_user(filepath_or_buffer): return os.path.expanduser(filepath_or_buffer) return filepath_or_buffer + def _validate_header_arg(header): if isinstance(header, bool): raise TypeError("Passing a bool to header is invalid. " @@ -218,6 +222,7 @@ def _validate_header_arg(header): "header=int or list-like of ints to specify " "the row(s) making up the column names") + def _stringify_path(filepath_or_buffer): """Return the argument coerced to a string if it was a pathlib.Path or a py.path.local @@ -263,8 +268,9 @@ def get_filepath_or_buffer(filepath_or_buffer, encoding=None, else: compression = None # cat on the compression to the tuple returned by the function - to_return = list(maybe_read_encoded_stream(req, encoding, compression)) + \ - [compression] + to_return = (list(maybe_read_encoded_stream(req, encoding, + compression)) + + [compression]) return tuple(to_return) if _is_s3_url(filepath_or_buffer): @@ -467,4 +473,4 @@ def _check_as_is(x): # write to the target stream self.stream.write(data) # empty queue - self.queue.truncate(0) \ No newline at end of file + self.queue.truncate(0) diff --git a/pandas/io/data.py b/pandas/io/data.py index ac6f14e846bec..5fa440e7bb1ff 100644 --- a/pandas/io/data.py +++ b/pandas/io/data.py @@ -3,6 +3,8 @@ """ +# flake8: noqa + import warnings import tempfile import datetime as dt diff --git a/pandas/io/excel.py b/pandas/io/excel.py index 106d263f56093..0642079cc5b34 100644 --- a/pandas/io/excel.py +++ b/pandas/io/excel.py @@ -2,23 +2,24 @@ Module parse to/from Excel """ -#---------------------------------------------------------------------- +# --------------------------------------------------------------------- # ExcelFile class +from datetime import datetime, date, time, MINYEAR + import os -import datetime import abc import numpy as np from pandas.core.frame import DataFrame from pandas.io.parsers import TextParser -from pandas.io.common import _is_url, _urlopen, _validate_header_arg, get_filepath_or_buffer, _is_s3_url +from pandas.io.common import (_is_url, _urlopen, _validate_header_arg, + get_filepath_or_buffer, _is_s3_url) from pandas.tseries.period import Period from pandas import json from pandas.compat import (map, zip, reduce, range, lrange, u, add_metaclass, - BytesIO, string_types) + string_types) from pandas.core import config from pandas.core.common import pprint_thing -from pandas.util.decorators import Appender import pandas.compat as compat import pandas.compat.openpyxl_compat as openpyxl_compat import 
pandas.core.common as com @@ -56,11 +57,11 @@ def get_writer(engine_name): # with version-less openpyxl engine # make sure we make the intelligent choice for the user if LooseVersion(openpyxl.__version__) < '2.0.0': - return _writers['openpyxl1'] + return _writers['openpyxl1'] elif LooseVersion(openpyxl.__version__) < '2.2.0': - return _writers['openpyxl20'] + return _writers['openpyxl20'] else: - return _writers['openpyxl22'] + return _writers['openpyxl22'] except ImportError: # fall through to normal exception handling below pass @@ -70,6 +71,7 @@ def get_writer(engine_name): except KeyError: raise ValueError("No Excel writer '%s'" % engine_name) + def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, names=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, @@ -86,15 +88,16 @@ def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0, file could be file://localhost/path/to/workbook.xlsx sheetname : string, int, mixed list of strings/ints, or None, default 0 - Strings are used for sheet names, Integers are used in zero-indexed sheet - positions. + Strings are used for sheet names, Integers are used in zero-indexed + sheet positions. Lists of strings/integers are used to request multiple sheets. Specify None to get all sheets. str|int -> DataFrame is returned. - list|None -> Dict of DataFrames is returned, with keys representing sheets. + list|None -> Dict of DataFrames is returned, with keys representing + sheets. Available Cases @@ -150,18 +153,16 @@ def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0, data will be read in as floats: Excel stores all numbers as floats internally has_index_names : boolean, default None - DEPRECATED: for version 0.17+ index names will be automatically inferred - based on index_col. To read Excel output from 0.16.2 and prior that - had saved index names, use True. + DEPRECATED: for version 0.17+ index names will be automatically + inferred based on index_col. To read Excel output from 0.16.2 and + prior that had saved index names, use True. Returns ------- parsed : DataFrame or Dict of DataFrames - DataFrame from the passed in Excel file. See notes in sheetname argument - for more information on when a Dict of Dataframes is returned. - + DataFrame from the passed in Excel file. See notes in sheetname + argument for more information on when a Dict of Dataframes is returned. """ - if not isinstance(io, ExcelFile): io = ExcelFile(io, engine=engine) @@ -172,6 +173,7 @@ def read_excel(io, sheetname=0, header=0, skiprows=None, skip_footer=0, convert_float=convert_float, has_index_names=has_index_names, skip_footer=skip_footer, converters=converters, **kwds) + class ExcelFile(object): """ Class for parsing tabular excel sheets into DataFrame objects. @@ -185,6 +187,7 @@ class ExcelFile(object): If io is not a buffer or path, this must be set to identify io. 
Acceptable values are None or xlrd """ + def __init__(self, io, **kwds): import xlrd # throw an ImportError if we need to @@ -223,7 +226,8 @@ def __init__(self, io, **kwds): def parse(self, sheetname=0, header=0, skiprows=None, skip_footer=0, index_col=None, parse_cols=None, parse_dates=False, date_parser=None, na_values=None, thousands=None, - convert_float=True, has_index_names=None, converters=None, **kwds): + convert_float=True, has_index_names=None, + converters=None, **kwds): """ Parse specified sheet(s) into a DataFrame @@ -313,7 +317,7 @@ def _parse_excel(self, sheetname=0, header=0, skiprows=None, skip_footer=0, epoch1904 = self.book.datemode - def _parse_cell(cell_contents,cell_typ): + def _parse_cell(cell_contents, cell_typ): """converts the contents of the cell into a pandas appropriate object""" @@ -327,20 +331,20 @@ def _parse_cell(cell_contents,cell_typ): # so we treat dates on the epoch as times only. # Also, Excel supports 1900 and 1904 epochs. year = (cell_contents.timetuple())[0:3] - if ((not epoch1904 and year == (1899, 12, 31)) - or (epoch1904 and year == (1904, 1, 1))): - cell_contents = datetime.time(cell_contents.hour, - cell_contents.minute, - cell_contents.second, - cell_contents.microsecond) + if ((not epoch1904 and year == (1899, 12, 31)) or + (epoch1904 and year == (1904, 1, 1))): + cell_contents = time(cell_contents.hour, + cell_contents.minute, + cell_contents.second, + cell_contents.microsecond) else: # Use the xlrd <= 0.9.2 date handling. dt = xldate.xldate_as_tuple(cell_contents, epoch1904) - if dt[0] < datetime.MINYEAR: - cell_contents = datetime.time(*dt[3:]) + if dt[0] < MINYEAR: + cell_contents = time(*dt[3:]) else: - cell_contents = datetime.datetime(*dt) + cell_contents = datetime(*dt) elif cell_typ == XL_CELL_ERROR: cell_contents = np.nan @@ -362,7 +366,7 @@ def _parse_cell(cell_contents,cell_typ): ret_dict = False - #Keep sheetname to maintain backwards compatibility. + # Keep sheetname to maintain backwards compatibility. if isinstance(sheetname, list): sheets = sheetname ret_dict = True @@ -372,7 +376,7 @@ def _parse_cell(cell_contents,cell_typ): else: sheets = [sheetname] - #handle same-type duplicates. + # handle same-type duplicates. 
sheets = list(set(sheets)) output = {} @@ -397,7 +401,7 @@ def _parse_cell(cell_contents,cell_typ): should_parse[j] = self._should_parse(j, parse_cols) if parse_cols is None or should_parse[j]: - row.append(_parse_cell(value,typ)) + row.append(_parse_cell(value, typ)) data.append(row) if sheet.nrows == 0: @@ -416,7 +420,8 @@ def _parse_cell(cell_contents,cell_typ): if com.is_integer(skiprows): row += skiprows data[row] = _fill_mi_header(data[row]) - header_name, data[row] = _pop_header_name(data[row], index_col) + header_name, data[row] = _pop_header_name( + data[row], index_col) header_names.append(header_name) else: data[header] = _trim_excel_header(data[header]) @@ -450,14 +455,14 @@ def _parse_cell(cell_contents,cell_typ): **kwds) output[asheetname] = parser.read() - output[asheetname].columns = output[asheetname].columns.set_names(header_names) + output[asheetname].columns = output[ + asheetname].columns.set_names(header_names) if ret_dict: return output else: return output[asheetname] - @property def sheet_names(self): return self.book.sheet_names() @@ -481,6 +486,7 @@ def _trim_excel_header(row): row = row[1:] return row + def _fill_mi_header(row): # forward fill blanks entries # from headers if parsing as MultiIndex @@ -493,6 +499,8 @@ def _fill_mi_header(row): return row # fill blank if index_col not None + + def _pop_header_name(row, index_col): """ (header, new_data) for header rows in MultiIndex parsing""" none_fill = lambda x: None if x == '' else x @@ -503,7 +511,8 @@ def _pop_header_name(row, index_col): else: # pop out header name and fill w/ blank i = index_col if not com.is_list_like(index_col) else max(index_col) - return none_fill(row[i]), row[:i] + [''] + row[i+1:] + return none_fill(row[i]), row[:i] + [''] + row[i + 1:] + def _conv_value(val): # Convert numpy types to Python types for the Excel writers. 
@@ -722,9 +731,8 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): for cell in cells: colletter = get_column_letter(startcol + cell.col + 1) xcell = wks.cell("%s%s" % (colletter, startrow + cell.row + 1)) - if (isinstance(cell.val, compat.string_types) - and xcell.data_type_for_value(cell.val) - != xcell.TYPE_STRING): + if (isinstance(cell.val, compat.string_types) and + xcell.data_type_for_value(cell.val) != xcell.TYPE_STRING): xcell.set_value_explicit(cell.val) else: xcell.value = _conv_value(cell.val) @@ -735,9 +743,9 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): xcell.style.__setattr__(field, style.__getattribute__(field)) - if isinstance(cell.val, datetime.datetime): + if isinstance(cell.val, datetime): xcell.style.number_format.format_code = self.datetime_format - elif isinstance(cell.val, datetime.date): + elif isinstance(cell.val, date): xcell.style.number_format.format_code = self.date_format if cell.mergestart is not None and cell.mergeend is not None: @@ -825,12 +833,12 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): style_kwargs = {} # Apply format codes before cell.style to allow override - if isinstance(cell.val, datetime.datetime): + if isinstance(cell.val, datetime): style_kwargs.update(self._convert_to_style_kwargs({ - 'number_format':{'format_code': self.datetime_format}})) - elif isinstance(cell.val, datetime.date): + 'number_format': {'format_code': self.datetime_format}})) + elif isinstance(cell.val, date): style_kwargs.update(self._convert_to_style_kwargs({ - 'number_format':{'format_code': self.date_format}})) + 'number_format': {'format_code': self.date_format}})) if cell.style: style_kwargs.update(self._convert_to_style_kwargs(cell.style)) @@ -896,14 +904,13 @@ def _convert_to_style_kwargs(cls, style_dict): if k in _style_key_map: k = _style_key_map[k] _conv_to_x = getattr(cls, '_convert_to_{0}'.format(k), - lambda x: None) + lambda x: None) new_v = _conv_to_x(v) if new_v: style_kwargs[k] = new_v return style_kwargs - @classmethod def _convert_to_color(cls, color_spec): """ @@ -932,7 +939,6 @@ def _convert_to_color(cls, color_spec): else: return Color(**color_spec) - @classmethod def _convert_to_font(cls, font_dict): """ @@ -981,7 +987,6 @@ def _convert_to_font(cls, font_dict): return Font(**font_kwargs) - @classmethod def _convert_to_stop(cls, stop_seq): """ @@ -999,7 +1004,6 @@ def _convert_to_stop(cls, stop_seq): return map(cls._convert_to_color, stop_seq) - @classmethod def _convert_to_fill(cls, fill_dict): """ @@ -1064,7 +1068,6 @@ def _convert_to_fill(cls, fill_dict): except TypeError: return GradientFill(**gfill_kwargs) - @classmethod def _convert_to_side(cls, side_spec): """ @@ -1100,7 +1103,6 @@ def _convert_to_side(cls, side_spec): return Side(**side_kwargs) - @classmethod def _convert_to_border(cls, border_dict): """ @@ -1144,7 +1146,6 @@ def _convert_to_border(cls, border_dict): return Border(**border_kwargs) - @classmethod def _convert_to_alignment(cls, alignment_dict): """ @@ -1168,7 +1169,6 @@ def _convert_to_alignment(cls, alignment_dict): return Alignment(**alignment_dict) - @classmethod def _convert_to_number_format(cls, number_format_dict): """ @@ -1212,6 +1212,7 @@ def _convert_to_protection(cls, protection_dict): register_writer(_Openpyxl20Writer) + class _Openpyxl22Writer(_Openpyxl20Writer): """ Note: Support for OpenPyxl v2.2 is currently EXPERIMENTAL (GH7565). 
@@ -1221,8 +1222,6 @@ class _Openpyxl22Writer(_Openpyxl20Writer): def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): # Write the frame cells using openpyxl. - from openpyxl import styles - sheet_name = self._get_sheet_name(sheet_name) _style_cache = {} @@ -1236,9 +1235,9 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): for cell in cells: xcell = wks.cell( - row=startrow + cell.row + 1, - column=startcol + cell.col + 1 - ) + row=startrow + cell.row + 1, + column=startcol + cell.col + 1 + ) xcell.value = _conv_value(cell.val) style_kwargs = {} @@ -1256,14 +1255,15 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): if cell.mergestart is not None and cell.mergeend is not None: wks.merge_cells( - start_row=startrow + cell.row + 1, - start_column=startcol + cell.col + 1, - end_column=startcol + cell.mergeend + 1, - end_row=startrow + cell.mergestart + 1 - ) + start_row=startrow + cell.row + 1, + start_column=startcol + cell.col + 1, + end_column=startcol + cell.mergeend + 1, + end_row=startrow + cell.mergestart + 1 + ) # When cells are merged only the top-left cell is preserved - # The behaviour of the other cells in a merged range is undefined + # The behaviour of the other cells in a merged range is + # undefined if style_kwargs: first_row = startrow + cell.row + 1 last_row = startrow + cell.mergestart + 1 @@ -1281,6 +1281,7 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): register_writer(_Openpyxl22Writer) + class _XlwtWriter(ExcelWriter): engine = 'xlwt' supported_extensions = ('.xls',) @@ -1320,9 +1321,9 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): val = _conv_value(cell.val) num_format_str = None - if isinstance(cell.val, datetime.datetime): + if isinstance(cell.val, datetime): num_format_str = self.datetime_format - elif isinstance(cell.val, datetime.date): + elif isinstance(cell.val, date): num_format_str = self.date_format stylekey = json.dumps(cell.style) @@ -1443,9 +1444,9 @@ def write_cells(self, cells, sheet_name=None, startrow=0, startcol=0): val = _conv_value(cell.val) num_format_str = None - if isinstance(cell.val, datetime.datetime): + if isinstance(cell.val, datetime): num_format_str = self.datetime_format - elif isinstance(cell.val, datetime.date): + elif isinstance(cell.val, date): num_format_str = self.date_format stylekey = json.dumps(cell.style) @@ -1500,11 +1501,11 @@ def _convert_to_style(self, style_dict, num_format_str=None): # Map the alignment to XlsxWriter alignment properties. alignment = style_dict.get('alignment') if alignment: - if (alignment.get('horizontal') - and alignment['horizontal'] == 'center'): + if (alignment.get('horizontal') and + alignment['horizontal'] == 'center'): xl_format.set_align('center') - if (alignment.get('vertical') - and alignment['vertical'] == 'top'): + if (alignment.get('vertical') and + alignment['vertical'] == 'top'): xl_format.set_align('top') # Map the cell borders to XlsxWriter border properties. diff --git a/pandas/io/ga.py b/pandas/io/ga.py index a6f9c9ed9467f..6dd0bb7472c37 100644 --- a/pandas/io/ga.py +++ b/pandas/io/ga.py @@ -4,6 +4,8 @@ 3. Goto APIs and register for OAuth2.0 for installed applications 4. 
Download JSON secret file and move into same directory as this file """ +# flake8: noqa + from datetime import datetime import re from pandas import compat diff --git a/pandas/io/gbq.py b/pandas/io/gbq.py index fff36a82529e3..4bf46f199c34a 100644 --- a/pandas/io/gbq.py +++ b/pandas/io/gbq.py @@ -12,9 +12,9 @@ from pandas.core.api import DataFrame from pandas.tools.merge import concat from pandas.core.common import PandasError -from pandas.util.decorators import deprecate from pandas.compat import lzip, bytes_to_str + def _check_google_client_version(): try: @@ -28,11 +28,16 @@ def _check_google_client_version(): else: google_api_minimum_version = '1.2.0' - _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution('google-api-python-client').version + _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution( + 'google-api-python-client').version - if StrictVersion(_GOOGLE_API_CLIENT_VERSION) < StrictVersion(google_api_minimum_version): - raise ImportError("pandas requires google-api-python-client >= {0} for Google BigQuery support, " - "current version {1}".format(google_api_minimum_version, _GOOGLE_API_CLIENT_VERSION)) + if (StrictVersion(_GOOGLE_API_CLIENT_VERSION) < + StrictVersion(google_api_minimum_version)): + raise ImportError("pandas requires google-api-python-client >= {0} " + "for Google BigQuery support, " + "current version {1}" + .format(google_api_minimum_version, + _GOOGLE_API_CLIENT_VERSION)) logger = logging.getLogger('pandas.io.gbq') logger.setLevel(logging.ERROR) @@ -87,7 +92,8 @@ class InvalidSchema(PandasError, ValueError): class NotFoundException(PandasError, ValueError): """ - Raised when the project_id, table or dataset provided in the query could not be found. + Raised when the project_id, table or dataset provided in the query could + not be found. 
""" pass @@ -118,15 +124,16 @@ def __init__(self, project_id, reauth=False): def test_google_api_imports(self): try: - import httplib2 - from apiclient.discovery import build - from apiclient.errors import HttpError - from oauth2client.client import AccessTokenRefreshError - from oauth2client.client import OAuth2WebServerFlow - from oauth2client.file import Storage - from oauth2client.tools import run_flow, argparser + import httplib2 # noqa + from apiclient.discovery import build # noqa + from apiclient.errors import HttpError # noqa + from oauth2client.client import AccessTokenRefreshError # noqa + from oauth2client.client import OAuth2WebServerFlow # noqa + from oauth2client.file import Storage # noqa + from oauth2client.tools import run_flow, argparser # noqa except ImportError as e: - raise ImportError("Missing module required for Google BigQuery support: {0}".format(str(e))) + raise ImportError("Missing module required for Google BigQuery " + "support: {0}".format(str(e))) def get_credentials(self): from oauth2client.client import OAuth2WebServerFlow @@ -135,10 +142,12 @@ def get_credentials(self): _check_google_client_version() - flow = OAuth2WebServerFlow(client_id='495642085510-k0tmvj2m941jhre2nbqka17vqpjfddtd.apps.googleusercontent.com', - client_secret='kOc9wMptUtxkcIFbtZCcrEAc', - scope='https://www.googleapis.com/auth/bigquery', - redirect_uri='urn:ietf:wg:oauth:2.0:oob') + flow = OAuth2WebServerFlow( + client_id=('495642085510-k0tmvj2m941jhre2nbqka17vqpjfddtd' + '.apps.googleusercontent.com'), + client_secret='kOc9wMptUtxkcIFbtZCcrEAc', + scope='https://www.googleapis.com/auth/bigquery', + redirect_uri='urn:ietf:wg:oauth:2.0:oob') storage = Storage('bigquery_credentials.dat') credentials = storage.get() @@ -163,7 +172,8 @@ def get_service(credentials): @staticmethod def process_http_error(ex): - # See `BigQuery Troubleshooting Errors `__ + # See `BigQuery Troubleshooting Errors + # `__ status = json.loads(bytes_to_str(ex.content))['error'] errors = status.get('errors', None) @@ -173,7 +183,8 @@ def process_http_error(ex): reason = error['reason'] message = error['message'] - raise GenericGBQException("Reason: {0}, Message: {1}".format(reason, message)) + raise GenericGBQException( + "Reason: {0}, Message: {1}".format(reason, message)) raise GenericGBQException(errors) @@ -186,13 +197,17 @@ def process_insert_errors(insert_errors, verbose): reason = error['reason'] message = error['message'] location = error['location'] - error_message = 'Error at Row: {0}, Reason: {1}, Location: {2}, Message: {3}'.format(row, reason, location, message) + error_message = ('Error at Row: {0}, Reason: {1}, ' + 'Location: {2}, Message: {3}' + .format(row, reason, location, message)) # Report all error messages if verbose is set if verbose: print(error_message) else: - raise StreamingInsertError(error_message + '\nEnable verbose logging to see all errors') + raise StreamingInsertError(error_message + + '\nEnable verbose logging to ' + 'see all errors') raise StreamingInsertError @@ -207,15 +222,18 @@ def run_query(self, query, verbose=True): 'configuration': { 'query': { 'query': query - # 'allowLargeResults', 'createDisposition', 'preserveNulls', destinationTable, useQueryCache + # 'allowLargeResults', 'createDisposition', + # 'preserveNulls', destinationTable, useQueryCache } } } try: - query_reply = job_collection.insert(projectId=self.project_id, body=job_data).execute() + query_reply = job_collection.insert( + projectId=self.project_id, body=job_data).execute() except AccessTokenRefreshError: 
- raise AccessDenied("The credentials have been revoked or expired, please re-run the application " + raise AccessDenied("The credentials have been revoked or expired, " + "please re-run the application " "to re-authorize") except HttpError as ex: self.process_http_error(ex) @@ -226,8 +244,9 @@ def run_query(self, query, verbose=True): if verbose: print('Waiting for job to complete...') try: - query_reply = job_collection.getQueryResults(projectId=job_reference['projectId'], - jobId=job_reference['jobId']).execute() + query_reply = job_collection.getQueryResults( + projectId=job_reference['projectId'], + jobId=job_reference['jobId']).execute() except HttpError as ex: self.process_http_error(ex) @@ -246,9 +265,9 @@ def run_query(self, query, verbose=True): page_token = query_reply.get('pageToken', None) if not page_token and current_row < total_rows: - raise InvalidPageToken( - "Required pageToken was missing. Received {0} of {1} rows".format(current_row, - total_rows)) + raise InvalidPageToken("Required pageToken was missing. " + "Received {0} of {1} rows" + .format(current_row, total_rows)) elif page_token in seen_page_tokens: raise InvalidPageToken("A duplicate pageToken was returned") @@ -257,9 +276,9 @@ def run_query(self, query, verbose=True): try: query_reply = job_collection.getQueryResults( - projectId=job_reference['projectId'], - jobId=job_reference['jobId'], - pageToken=page_token).execute() + projectId=job_reference['projectId'], + jobId=job_reference['jobId'], + pageToken=page_token).execute() except HttpError as ex: self.process_http_error(ex) @@ -290,23 +309,28 @@ def load_data(self, dataframe, dataset_id, table_id, chunksize, verbose): if (len(rows) % chunksize == 0) or (remaining_rows == 0): if verbose: - print("\rStreaming Insert is {0}% Complete".format(((total_rows - remaining_rows) * 100) / total_rows)) + print("\rStreaming Insert is {0}% Complete".format( + ((total_rows - remaining_rows) * 100) / total_rows)) body = {'rows': rows} try: response = self.service.tabledata().insertAll( - projectId = self.project_id, - datasetId = dataset_id, - tableId = table_id, - body = body).execute() + projectId=self.project_id, + datasetId=dataset_id, + tableId=table_id, + body=body).execute() except HttpError as ex: self.process_http_error(ex) - # For streaming inserts, even if you receive a success HTTP response code, you'll need to check the - # insertErrors property of the response to determine if the row insertions were successful, because - # it's possible that BigQuery was only partially successful at inserting the rows. - # See the `Success HTTP Response Codes `__ + # For streaming inserts, even if you receive a success HTTP + # response code, you'll need to check the insertErrors property + # of the response to determine if the row insertions were + # successful, because it's possible that BigQuery was only + # partially successful at inserting the rows. 
See the `Success + # HTTP Response Codes + # `__ # section insert_errors = response.get('insertErrors', None) @@ -332,16 +356,20 @@ def verify_schema(self, dataset_id, table_id, schema): except HttpError as ex: self.process_http_error(ex) - def delete_and_recreate_table(self, dataset_id, table_id, table_schema, verbose): + def delete_and_recreate_table(self, dataset_id, table_id, + table_schema, verbose): delay = 0 - # Changes to table schema may take up to 2 minutes as of May 2015 - # See `Issue 191 `__ - # Compare previous schema with new schema to determine if there should be a 120 second delay + # Changes to table schema may take up to 2 minutes as of May 2015 See + # `Issue 191 + # `__ + # Compare previous schema with new schema to determine if there should + # be a 120 second delay if not self.verify_schema(dataset_id, table_id, table_schema): if verbose: - print('The existing table has a different schema. Please wait 2 minutes. See Google BigQuery issue #191') + print('The existing table has a different schema. ' + 'Please wait 2 minutes. See Google BigQuery issue #191') delay = 120 table = _Table(self.project_id, dataset_id) @@ -351,10 +379,13 @@ def delete_and_recreate_table(self, dataset_id, table_id, table_schema, verbose) def _parse_data(schema, rows): - # see: http://pandas.pydata.org/pandas-docs/dev/missing_data.html#missing-data-casting-rules-and-indexing + # see: + # http://pandas.pydata.org/pandas-docs/dev/missing_data.html + # #missing-data-casting-rules-and-indexing dtype_map = {'INTEGER': np.dtype(float), 'FLOAT': np.dtype(float), - 'TIMESTAMP': 'M8[ns]'} # This seems to be buggy without nanosecond indicator + # This seems to be buggy without nanosecond indicator + 'TIMESTAMP': 'M8[ns]'} fields = schema['fields'] col_types = [field['type'] for field in fields] @@ -386,15 +417,17 @@ def _parse_entry(field_value, field_type): return field_value -def read_gbq(query, project_id=None, index_col=None, col_order=None, reauth=False, verbose=True): +def read_gbq(query, project_id=None, index_col=None, col_order=None, + reauth=False, verbose=True): """Load data from Google BigQuery. THIS IS AN EXPERIMENTAL LIBRARY - The main method a user calls to execute a Query in Google BigQuery and read results - into a pandas DataFrame using the v2 Google API client for Python. Documentation for - the API is available at https://developers.google.com/api-client-library/python/. - Authentication to the Google BigQuery service is via OAuth 2.0 using the product name + The main method a user calls to execute a Query in Google BigQuery and read + results into a pandas DataFrame using the v2 Google API client for Python. + Documentation for the API is available at + https://developers.google.com/api-client-library/python/. Authentication + to the Google BigQuery service is via OAuth 2.0 using the product name 'pandas GBQ'. Parameters @@ -493,7 +526,8 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000, raise ValueError("'{0}' is not valid for if_exists".format(if_exists)) if '.' not in destination_table: - raise NotFoundException("Invalid Table Name. Should be of the form 'datasetId.tableId' ") + raise NotFoundException( + "Invalid Table Name. 
Should be of the form 'datasetId.tableId' ") connector = GbqConnector(project_id, reauth=reauth) dataset_id, table_id = destination_table.rsplit('.', 1) @@ -505,14 +539,19 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000, # If table exists, check if_exists parameter if table.exists(table_id): if if_exists == 'fail': - raise TableCreationError("Could not create the table because it already exists. " - "Change the if_exists parameter to append or replace data.") + raise TableCreationError("Could not create the table because it " + "already exists. " + "Change the if_exists parameter to " + "append or replace data.") elif if_exists == 'replace': - connector.delete_and_recreate_table(dataset_id, table_id, table_schema, verbose) + connector.delete_and_recreate_table( + dataset_id, table_id, table_schema, verbose) elif if_exists == 'append': if not connector.verify_schema(dataset_id, table_id, table_schema): - raise InvalidSchema("Please verify that the column order, structure and data types in the DataFrame " - "match the schema of the destination table.") + raise InvalidSchema("Please verify that the column order, " + "structure and data types in the " + "DataFrame match the schema of the " + "destination table.") else: table.create(table_id, table_schema) @@ -520,13 +559,13 @@ def to_gbq(dataframe, destination_table, project_id, chunksize=10000, def generate_bq_schema(df, default_type='STRING'): - # deprecation TimeSeries, #11121 - warnings.warn("generate_bq_schema is deprecated and will be removed in a future version", - FutureWarning, stacklevel=2) + warnings.warn("generate_bq_schema is deprecated and will be removed in " + "a future version", FutureWarning, stacklevel=2) return _generate_bq_schema(df, default_type=default_type) + def _generate_bq_schema(df, default_type='STRING'): """ Given a passed df, generate the associated Google BigQuery schema. @@ -555,6 +594,7 @@ def _generate_bq_schema(df, default_type='STRING'): return {'fields': fields} + class _Table(GbqConnector): def __init__(self, project_id, dataset_id, reauth=False): @@ -585,9 +625,9 @@ def exists(self, table_id): try: self.service.tables().get( - projectId=self.project_id, - datasetId=self.dataset_id, - tableId=table_id).execute() + projectId=self.project_id, + datasetId=self.dataset_id, + tableId=table_id).execute() return True except self.http_error as ex: if ex.resp.status == 404: @@ -605,11 +645,13 @@ def create(self, table_id, schema): table : str Name of table to be written schema : str - Use the generate_bq_schema to generate your table schema from a dataframe. + Use the generate_bq_schema to generate your table schema from a + dataframe. 
""" if self.exists(table_id): - raise TableCreationError("The table could not be created because it already exists") + raise TableCreationError( + "The table could not be created because it already exists") if not _Dataset(self.project_id).exists(self.dataset_id): _Dataset(self.project_id).create(self.dataset_id) @@ -625,9 +667,9 @@ def create(self, table_id, schema): try: self.service.tables().insert( - projectId=self.project_id, - datasetId=self.dataset_id, - body=body).execute() + projectId=self.project_id, + datasetId=self.dataset_id, + body=body).execute() except self.http_error as ex: self.process_http_error(ex) @@ -647,9 +689,9 @@ def delete(self, table_id): try: self.service.tables().delete( - datasetId=self.dataset_id, - projectId=self.project_id, - tableId=table_id).execute() + datasetId=self.dataset_id, + projectId=self.project_id, + tableId=table_id).execute() except self.http_error as ex: self.process_http_error(ex) @@ -683,8 +725,8 @@ def exists(self, dataset_id): try: self.service.datasets().get( - projectId=self.project_id, - datasetId=dataset_id).execute() + projectId=self.project_id, + datasetId=dataset_id).execute() return True except self.http_error as ex: if ex.resp.status == 404: @@ -709,7 +751,7 @@ def datasets(self): try: list_dataset_response = self.service.datasets().list( - projectId=self.project_id).execute().get('datasets', None) + projectId=self.project_id).execute().get('datasets', None) if not list_dataset_response: return [] @@ -735,7 +777,8 @@ def create(self, dataset_id): """ if self.exists(dataset_id): - raise DatasetCreationError("The dataset could not be created because it already exists") + raise DatasetCreationError( + "The dataset could not be created because it already exists") body = { 'datasetReference': { @@ -746,8 +789,8 @@ def create(self, dataset_id): try: self.service.datasets().insert( - projectId=self.project_id, - body=body).execute() + projectId=self.project_id, + body=body).execute() except self.http_error as ex: self.process_http_error(ex) @@ -763,12 +806,13 @@ def delete(self, dataset_id): """ if not self.exists(dataset_id): - raise NotFoundException("Dataset {0} does not exist".format(dataset_id)) + raise NotFoundException( + "Dataset {0} does not exist".format(dataset_id)) try: self.service.datasets().delete( - datasetId=dataset_id, - projectId=self.project_id).execute() + datasetId=dataset_id, + projectId=self.project_id).execute() except self.http_error as ex: self.process_http_error(ex) @@ -791,8 +835,8 @@ def tables(self, dataset_id): try: list_table_response = self.service.tables().list( - projectId=self.project_id, - datasetId=dataset_id).execute().get('tables', None) + projectId=self.project_id, + datasetId=dataset_id).execute().get('tables', None) if not list_table_response: return [] diff --git a/pandas/io/html.py b/pandas/io/html.py index f175702dedabc..b21f1ef7f160c 100644 --- a/pandas/io/html.py +++ b/pandas/io/html.py @@ -7,7 +7,6 @@ import re import numbers import collections -import warnings from distutils.version import LooseVersion @@ -26,6 +25,7 @@ _HAS_LXML = False _HAS_HTML5LIB = False + def _importers(): # import things we need # but make this done on a first use basis @@ -39,19 +39,19 @@ def _importers(): global _HAS_BS4, _HAS_LXML, _HAS_HTML5LIB try: - import bs4 + import bs4 # noqa _HAS_BS4 = True except ImportError: pass try: - import lxml + import lxml # noqa _HAS_LXML = True except ImportError: pass try: - import html5lib + import html5lib # noqa _HAS_HTML5LIB = True except ImportError: pass @@ -183,6 
+183,7 @@ class _HtmlFrameParser(object): See each method's respective documentation for details on their functionality. """ + def __init__(self, io, match, attrs, encoding): self.io = io self.match = match @@ -385,6 +386,7 @@ class _BeautifulSoupHtml5LibFrameParser(_HtmlFrameParser): Documentation strings for this class are in the base class :class:`pandas.io.html._HtmlFrameParser`. """ + def __init__(self, *args, **kwargs): super(_BeautifulSoupHtml5LibFrameParser, self).__init__(*args, **kwargs) @@ -488,6 +490,7 @@ class _LxmlFrameParser(_HtmlFrameParser): Documentation strings for this class are in the base class :class:`_HtmlFrameParser`. """ + def __init__(self, *args, **kwargs): super(_LxmlFrameParser, self).__init__(*args, **kwargs) @@ -662,7 +665,8 @@ def _parser_dispatch(flavor): if not _HAS_HTML5LIB: raise ImportError("html5lib not found, please install it") if not _HAS_BS4: - raise ImportError("BeautifulSoup4 (bs4) not found, please install it") + raise ImportError( + "BeautifulSoup4 (bs4) not found, please install it") import bs4 if bs4.__version__ == LooseVersion('4.2.0'): raise ValueError("You're using a version" @@ -737,7 +741,7 @@ def _parse(flavor, io, match, header, index_col, skiprows, parse_dates=parse_dates, tupleize_cols=tupleize_cols, thousands=thousands)) - except StopIteration: # empty table + except StopIteration: # empty table continue return ret diff --git a/pandas/io/json.py b/pandas/io/json.py index f368f0e6cf28e..76cda87043a37 100644 --- a/pandas/io/json.py +++ b/pandas/io/json.py @@ -16,7 +16,8 @@ loads = _json.loads dumps = _json.dumps -### interface to/from ### + +# interface to/from def to_json(path_or_buf, obj, orient=None, date_format='epoch', @@ -115,7 +116,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, file. For file URLs, a host is expected. 
For instance, a local file could be ``file://localhost/path/to/table.json`` - orient + orient * `Series` @@ -151,15 +152,15 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, convert_dates : boolean, default True List of columns to parse for dates; If True, then try to parse datelike columns default is True; a column label is datelike if - + * it ends with ``'_at'``, - + * it ends with ``'_time'``, - + * it begins with ``'timestamp'``, - + * it is ``'modified'``, or - + * it is ``'date'`` keep_default_dates : boolean, default True @@ -190,7 +191,7 @@ def read_json(path_or_buf=None, orient=None, typ='frame', dtype=True, # if the filepath is too long will raise here # 5874 - except (TypeError,ValueError): + except (TypeError, ValueError): exists = False if exists: @@ -566,13 +567,13 @@ def is_ok(col): self._process_converter( lambda col, c: self._try_convert_to_date(c), - lambda col, c: ((self.keep_default_dates and is_ok(col)) - or col in convert_dates)) + lambda col, c: ((self.keep_default_dates and is_ok(col)) or + col in convert_dates)) - -#---------------------------------------------------------------------- +# --------------------------------------------------------------------- # JSON normalization routines + def nested_to_record(ds, prefix="", level=0): """a simplified json_normalize @@ -627,7 +628,7 @@ def nested_to_record(ds, prefix="", level=0): continue else: v = new_d.pop(k) - new_d.update(nested_to_record(v, newkey, level+1)) + new_d.update(nested_to_record(v, newkey, level + 1)) new_ds.append(new_d) if singleton: @@ -741,7 +742,7 @@ def _recursive_extract(data, path, seen_meta, level=0): seen_meta[key] = _pull_field(obj, val[-1]) _recursive_extract(obj[path[0]], path[1:], - seen_meta, level=level+1) + seen_meta, level=level + 1) else: for obj in data: recs = _pull_field(obj, path[0]) diff --git a/pandas/io/packers.py b/pandas/io/packers.py index 0ba1254659540..a16f3600736b8 100644 --- a/pandas/io/packers.py +++ b/pandas/io/packers.py @@ -1,12 +1,10 @@ """ Msgpack serializer support for reading and writing pandas data structures to disk -""" -# portions of msgpack_numpy package, by Lev Givon were incorporated -# into this module (and tests_packers.py) +portions of msgpack_numpy package, by Lev Givon were incorporated +into this module (and tests_packers.py) -""" License ======= @@ -46,12 +44,10 @@ import numpy as np from pandas import compat -from pandas.compat import u, PY3 -from pandas import ( - Timestamp, Period, Series, DataFrame, Panel, Panel4D, - Index, MultiIndex, Int64Index, RangeIndex, PeriodIndex, - DatetimeIndex, Float64Index, NaT -) +from pandas.compat import u +from pandas import (Timestamp, Period, Series, DataFrame, # noqa + Index, MultiIndex, Float64Index, Int64Index, + Panel, RangeIndex, PeriodIndex, DatetimeIndex) from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.core.generic import NDFrame @@ -174,7 +170,7 @@ def read(fh): # this is platform int, which we need to remap to np.int64 # for compat on windows platforms 7: np.dtype('int64'), -} + } def dtype_for(t): @@ -183,9 +179,9 @@ def dtype_for(t): return dtype_dict[t] return np.typeDict[t] -c2f_dict = {'complex': np.float64, +c2f_dict = {'complex': np.float64, 'complex128': np.float64, - 'complex64': np.float32} + 'complex64': np.float32} # numpy 1.6.1 compat if hasattr(np, 'float128'): @@ -322,16 +318,16 @@ def encode(obj): raise NotImplementedError( 'msgpack sparse series is not implemented' ) - 
#d = {'typ': 'sparse_series', + # d = {'typ': 'sparse_series', # 'klass': obj.__class__.__name__, # 'dtype': obj.dtype.name, # 'index': obj.index, # 'sp_index': obj.sp_index, # 'sp_values': convert(obj.sp_values), # 'compress': compressor} - #for f in ['name', 'fill_value', 'kind']: + # for f in ['name', 'fill_value', 'kind']: # d[f] = getattr(obj, f, None) - #return d + # return d else: return {'typ': 'series', 'klass': obj.__class__.__name__, @@ -345,33 +341,33 @@ def encode(obj): raise NotImplementedError( 'msgpack sparse frame is not implemented' ) - #d = {'typ': 'sparse_dataframe', + # d = {'typ': 'sparse_dataframe', # 'klass': obj.__class__.__name__, # 'columns': obj.columns} - #for f in ['default_fill_value', 'default_kind']: + # for f in ['default_fill_value', 'default_kind']: # d[f] = getattr(obj, f, None) - #d['data'] = dict([(name, ss) + # d['data'] = dict([(name, ss) # for name, ss in compat.iteritems(obj)]) - #return d + # return d elif isinstance(obj, SparsePanel): raise NotImplementedError( 'msgpack sparse frame is not implemented' ) - #d = {'typ': 'sparse_panel', + # d = {'typ': 'sparse_panel', # 'klass': obj.__class__.__name__, # 'items': obj.items} - #for f in ['default_fill_value', 'default_kind']: + # for f in ['default_fill_value', 'default_kind']: # d[f] = getattr(obj, f, None) - #d['data'] = dict([(name, df) + # d['data'] = dict([(name, df) # for name, df in compat.iteritems(obj)]) - #return d + # return d else: data = obj._data if not data.is_consolidated(): data = data.consolidate() - # the block manager + # the block manager return {'typ': 'block_manager', 'klass': obj.__class__.__name__, 'axes': data.axes, @@ -512,7 +508,8 @@ def create_block(b): values = unconvert(b['values'], dtype_for(b['dtype']), b['compress']).reshape(b['shape']) - # locs handles duplicate column names, and should be used instead of items; see GH 9618 + # locs handles duplicate column names, and should be used instead + # of items; see GH 9618 if 'locs' in b: placement = b['locs'] else: @@ -533,19 +530,19 @@ def create_block(b): return timedelta(*obj['data']) elif typ == 'timedelta64': return np.timedelta64(int(obj['data'])) - #elif typ == 'sparse_series': + # elif typ == 'sparse_series': # dtype = dtype_for(obj['dtype']) # return globals()[obj['klass']]( # unconvert(obj['sp_values'], dtype, obj['compress']), # sparse_index=obj['sp_index'], index=obj['index'], # fill_value=obj['fill_value'], kind=obj['kind'], name=obj['name']) - #elif typ == 'sparse_dataframe': + # elif typ == 'sparse_dataframe': # return globals()[obj['klass']]( # obj['data'], columns=obj['columns'], # default_fill_value=obj['default_fill_value'], # default_kind=obj['default_kind'] # ) - #elif typ == 'sparse_panel': + # elif typ == 'sparse_panel': # return globals()[obj['klass']]( # obj['data'], items=obj['items'], # default_fill_value=obj['default_fill_value'], diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py index 9d25eaecc6620..293a4701eb46d 100755 --- a/pandas/io/parsers.py +++ b/pandas/io/parsers.py @@ -24,7 +24,6 @@ from pandas.util.decorators import Appender import pandas.lib as lib -import pandas.tslib as tslib import pandas.parser as _parser @@ -70,13 +69,13 @@ class ParserWarning(Warning): If None defaults to Excel dialect. Ignored if sep longer than 1 char See csv.Dialect documentation for more details header : int, list of ints, default 'infer' - Row number(s) to use as the column names, and the start of the - data. Defaults to 0 if no ``names`` passed, otherwise ``None``. 
Explicitly - pass ``header=0`` to be able to replace existing names. The header can be - a list of integers that specify row locations for a multi-index on the - columns E.g. [0,1,3]. Intervening rows that are not specified will be - skipped (e.g. 2 in this example are skipped). Note that this parameter - ignores commented lines and empty lines if ``skip_blank_lines=True``, so header=0 + Row number(s) to use as the column names, and the start of the data. + Defaults to 0 if no ``names`` passed, otherwise ``None``. Explicitly pass + ``header=0`` to be able to replace existing names. The header can be a list + of integers that specify row locations for a multi-index on the columns + E.g. [0,1,3]. Intervening rows that are not specified will be skipped + (e.g. 2 in this example are skipped). Note that this parameter ignores + commented lines and empty lines if ``skip_blank_lines=True``, so header=0 denotes the first line of data rather than the first line of the file. skiprows : list-like or integer, default None Line numbers to skip (0-indexed) or number of lines to skip (int) @@ -101,42 +100,56 @@ class ParserWarning(Warning): keep_default_na : bool, default True If na_values are specified and keep_default_na is False the default NaN values are overridden, otherwise they're appended to -parse_dates : boolean, list of ints or names, list of lists, or dict, default False - If True -> try parsing the index. - If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date column. - If [[1, 3]] -> combine columns 1 and 3 and parse as a single date column. - {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo' - A fast-path exists for iso8601-formatted dates. +parse_dates : various, default False + Acceptable input types + * boolean. If True -> try parsing the index. + + * list of ints or names + + * If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date + column. + + * list of lists + + * If [[1, 3]] -> combine columns 1 and 3 and parse as a single date + column. + + * dict + + * {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo' + + Note: A fast-path exists for iso8601-formatted dates. keep_date_col : boolean, default False If True and parse_dates specifies combining multiple columns then keep the original columns. date_parser : function, default None - Function to use for converting a sequence of string columns to an - array of datetime instances. The default uses dateutil.parser.parser - to do the conversion. Pandas will try to call date_parser in three different - ways, advancing to the next if an exception occurs: 1) Pass one or more arrays - (as defined by parse_dates) as arguments; 2) concatenate (row-wise) the string - values from the columns defined by parse_dates into a single array and pass - that; and 3) call date_parser once for each row using one or more strings - (corresponding to the columns defined by parse_dates) as arguments. + Function to use for converting a sequence of string columns to an array of + datetime instances. The default uses dateutil.parser.parser to do the + conversion. Pandas will try to call date_parser in three different ways, + advancing to the next if an exception occurs: 1) Pass one or more arrays + (as defined by parse_dates) as arguments; 2) concatenate (row-wise) the + string values from the columns defined by parse_dates into a single array + and pass that; and 3) call date_parser once for each row using one or more + strings (corresponding to the columns defined by parse_dates) as arguments. 
dayfirst : boolean, default False DD/MM format dates, international and European format thousands : str, default None Thousands separator comment : str, default None - Indicates remainder of line should not be parsed. If found at the - beginning of a line, the line will be ignored altogether. This parameter - must be a single character. Like empty lines (as long as ``skip_blank_lines=True``), - fully commented lines are ignored by the parameter `header` - but not by `skiprows`. For example, if comment='#', parsing - '#empty\\na,b,c\\n1,2,3' with `header=0` will result in 'a,b,c' being + Indicates remainder of line should not be parsed. If found at the beginning + of a line, the line will be ignored altogether. This parameter must be a + single character. Like empty lines (as long as ``skip_blank_lines=True``), + fully commented lines are ignored by the parameter `header` but not by + `skiprows`. For example, if comment='#', parsing '#empty\\na,b,c\\n1,2,3' + with `header=0` will result in 'a,b,c' being treated as the header. decimal : str, default '.' Character to recognize as decimal point. E.g. use ',' for European data nrows : int, default None Number of rows of file to read. Useful for reading pieces of large files iterator : boolean, default False - Return TextFileReader object for iteration or getting chunks with ``get_chunk()``. + Return TextFileReader object for iteration or getting chunks with + ``get_chunk()``. chunksize : int, default None Return TextFileReader object for iteration. `See IO Tools docs for more information @@ -242,9 +255,10 @@ def _read(filepath_or_buffer, kwds): if skipfooter is not None: kwds['skip_footer'] = skipfooter - # If the input could be a filename, check for a recognizable compression extension. - # If we're reading from a URL, the `get_filepath_or_buffer` will use header info - # to determine compression, so use what it finds in that case. + # If the input could be a filename, check for a recognizable compression + # extension. If we're reading from a URL, the `get_filepath_or_buffer` + # will use header info to determine compression, so use what it finds in + # that case. 
inferred_compression = kwds.get('compression') if inferred_compression == 'infer': if isinstance(filepath_or_buffer, compat.string_types): @@ -257,10 +271,11 @@ def _read(filepath_or_buffer, kwds): else: inferred_compression = None - filepath_or_buffer, _, compression = get_filepath_or_buffer(filepath_or_buffer, - encoding, - compression=kwds.get('compression', None)) - kwds['compression'] = inferred_compression if compression == 'infer' else compression + filepath_or_buffer, _, compression = get_filepath_or_buffer( + filepath_or_buffer, encoding, + compression=kwds.get('compression', None)) + kwds['compression'] = (inferred_compression if compression == 'infer' + else compression) if kwds.get('date_parser', None) is not None: if isinstance(kwds['parse_dates'], bool): @@ -533,8 +548,8 @@ def read_fwf(filepath_or_buffer, colspecs='infer', widths=None, **kwds): # no longer excluding inf representations # '1.#INF','-1.#INF', '1.#INF000000', _NA_VALUES = set([ - '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', 'N/A', 'NA', '#NA', - 'NULL', 'NaN', '-NaN', 'nan', '-nan', '' + '-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', '#N/A N/A', '#N/A', + 'N/A', 'NA', '#NA', 'NULL', 'NaN', '-NaN', 'nan', '-nan', '' ]) @@ -658,7 +673,8 @@ def _clean_options(self, options, engine): msg = ("Falling back to the 'python' engine because" " {reason}, but this causes {option!r} to be" " ignored as it is not supported by the 'python'" - " engine.").format(reason=fallback_reason, option=arg) + " engine.").format(reason=fallback_reason, + option=arg) if arg == 'dtype': msg += " (Note the 'converters' option provides"\ " similar functionality.)" @@ -1431,7 +1447,7 @@ def __init__(self, f, **kwds): if isinstance(f, compat.string_types): f = _get_handle(f, 'r', encoding=self.encoding, - compression=self.compression) + compression=self.compression) elif self.compression: f = _wrap_compressed(f, self.compression, self.encoding) # in Python 3, convert BytesIO or fileobjects passed with an encoding @@ -1472,8 +1488,8 @@ def __init__(self, f, **kwds): # multiple date column thing turning into a real spaghetti factory if not self._has_complex_date_col: - (index_names, - self.orig_names, self.columns) = self._get_index_name(self.columns) + (index_names, self.orig_names, self.columns) = ( + self._get_index_name(self.columns)) self._name_processed = True if self.index_names is None: self.index_names = index_names @@ -1697,7 +1713,7 @@ def _infer_columns(self): lc = len(this_columns) ic = (len(self.index_col) if self.index_col is not None else 0) - if lc != unnamed_count and lc-ic > unnamed_count: + if lc != unnamed_count and lc - ic > unnamed_count: clear_buffer = False this_columns = [None] * lc self.buf = [self.buf[-1]] @@ -1710,10 +1726,10 @@ def _infer_columns(self): self._clear_buffer() if names is not None: - if ((self.usecols is not None - and len(names) != len(self.usecols)) - or (self.usecols is None - and len(names) != len(columns[0]))): + if ((self.usecols is not None and + len(names) != len(self.usecols)) or + (self.usecols is None and + len(names) != len(columns[0]))): raise ValueError('Number of passed names did not match ' 'number of header fields in the file') if len(columns) > 1: @@ -1737,7 +1753,8 @@ def _infer_columns(self): num_original_columns = ncols if not names: if self.prefix: - columns = [['%s%d' % (self.prefix, i) for i in range(ncols)]] + columns = [['%s%d' % (self.prefix, i) + for i in range(ncols)]] else: columns = [lrange(ncols)] columns = self._handle_usecols(columns, columns[0]) @@ 
-1824,7 +1841,8 @@ def _next_line(self): orig_line = next(self.data) line = self._check_comments([orig_line])[0] self.pos += 1 - if not self.skip_blank_lines and (self._empty(orig_line) or line): + if (not self.skip_blank_lines and + (self._empty(orig_line) or line)): break elif self.skip_blank_lines: ret = self._check_empty([line]) @@ -1858,8 +1876,9 @@ def _check_empty(self, lines): ret = [] for l in lines: # Remove empty lines and lines with only one whitespace value - if len(l) > 1 or len(l) == 1 and (not isinstance(l[0], - compat.string_types) or l[0].strip()): + if (len(l) > 1 or len(l) == 1 and + (not isinstance(l[0], compat.string_types) or + l[0].strip())): ret.append(l) return ret @@ -1873,9 +1892,9 @@ def _check_thousands(self, lines): for i, x in enumerate(l): if (not isinstance(x, compat.string_types) or self.thousands not in x or - (self._no_thousands_columns - and i in self._no_thousands_columns) - or nonnum.search(x.strip())): + (self._no_thousands_columns and + i in self._no_thousands_columns) or + nonnum.search(x.strip())): rl.append(x) else: rl.append(x.replace(self.thousands, '')) @@ -1983,9 +2002,8 @@ def _rows_to_cols(self, content): if self._implicit_index: zipped_content = [ a for i, a in enumerate(zipped_content) - if (i < len(self.index_col) - or i - len(self.index_col) in self._col_indices) - ] + if (i < len(self.index_col) or + i - len(self.index_col) in self._col_indices)] else: zipped_content = [a for i, a in enumerate(zipped_content) if i in self._col_indices] @@ -2087,7 +2105,8 @@ def converter(*date_cols): lib.try_parse_dates(strs, dayfirst=dayfirst)) else: try: - result = tools.to_datetime(date_parser(*date_cols), errors='ignore') + result = tools.to_datetime( + date_parser(*date_cols), errors='ignore') if isinstance(result, datetime.datetime): raise Exception('scalar parser') return result @@ -2109,9 +2128,9 @@ def _process_date_conversion(data_dict, converter, parse_spec, keep_date_col=False): def _isindex(colspec): return ((isinstance(index_col, list) and - colspec in index_col) - or (isinstance(index_names, list) and - colspec in index_names)) + colspec in index_col) or + (isinstance(index_names, list) and + colspec in index_names)) new_cols = [] new_data = {} @@ -2262,13 +2281,14 @@ def _get_empty_meta(columns, index_col, index_names, dtype=None): index = Index([]) else: index = [np.empty(0, dtype=dtype.get(index_name, np.object)) - for index_name in index_names] + for index_name in index_names] index = MultiIndex.from_arrays(index, names=index_names) index_col.sort() for i, n in enumerate(index_col): - columns.pop(n-i) + columns.pop(n - i) - col_dict = dict((col_name, np.empty(0, dtype=dtype.get(col_name, np.object))) + col_dict = dict((col_name, + np.empty(0, dtype=dtype.get(col_name, np.object))) for col_name in columns) return index, columns, col_dict @@ -2315,8 +2335,6 @@ def _stringify_na_values(na_values): def _get_na_values(col, na_values, na_fvalues): if isinstance(na_values, dict): if col in na_values: - values = na_values[col] - fvalues = na_fvalues[col] return na_values[col], na_fvalues[col] else: return _NA_VALUES, set() @@ -2355,6 +2373,7 @@ class FixedWidthReader(object): """ A reader of fixed-width lines. """ + def __init__(self, f, colspecs, delimiter, comment): self.f = f self.buffer = None @@ -2426,6 +2445,7 @@ class FixedWidthFieldParser(PythonParser): Specialization that Converts fixed-width fields into DataFrames. See PythonParser for details. """ + def __init__(self, f, **kwds): # Support iterators, convert to a list. 
self.colspecs = kwds.pop('colspecs') diff --git a/pandas/io/pickle.py b/pandas/io/pickle.py index 52a9ef0370e9e..3b1338df525b2 100644 --- a/pandas/io/pickle.py +++ b/pandas/io/pickle.py @@ -1,5 +1,6 @@ from pandas.compat import cPickle as pkl, pickle_compat as pc, PY3 + def to_pickle(obj, path): """ Pickle (serialize) object to input file path @@ -44,8 +45,7 @@ def try_read(path, encoding=None): try: with open(path, 'rb') as fh: return pkl.load(fh) - except (Exception) as e: - + except Exception: # reg/patched pickle try: with open(path, 'rb') as fh: diff --git a/pandas/io/pytables.py b/pandas/io/pytables.py index fe063f5b4bc4d..9b59007d4268f 100644 --- a/pandas/io/pytables.py +++ b/pandas/io/pytables.py @@ -15,7 +15,8 @@ import numpy as np import pandas as pd from pandas import (Series, DataFrame, Panel, Panel4D, Index, - MultiIndex, Int64Index, Timestamp) + MultiIndex, Int64Index) +from pandas.core import config from pandas.sparse.api import SparseSeries, SparseDataFrame, SparsePanel from pandas.sparse.array import BlockIndex, IntIndex from pandas.tseries.api import PeriodIndex, DatetimeIndex @@ -25,10 +26,10 @@ from pandas.core.algorithms import match, unique from pandas.core.categorical import Categorical from pandas.core.common import _asarray_tuplesafe -from pandas.core.internals import (BlockManager, make_block, _block2d_to_blocknd, +from pandas.core.internals import (BlockManager, make_block, + _block2d_to_blocknd, _factor_indexer, _block_shape) from pandas.core.index import _ensure_index -from pandas.tseries.timedeltas import _coerce_scalar_to_timedelta_type import pandas.core.common as com from pandas.tools.merge import concat from pandas import compat @@ -41,16 +42,16 @@ import pandas.algos as algos import pandas.tslib as tslib -from contextlib import contextmanager from distutils.version import LooseVersion # versioning attribute _version = '0.15.2' -### encoding ### +# encoding # PY3 encoding if we don't specify _default_encoding = 'UTF-8' + def _ensure_decoded(s): """ if we have bytes, decode them to unicode """ if isinstance(s, np.bytes_): @@ -67,6 +68,7 @@ def _ensure_encoding(encoding): Term = Expr + def _ensure_term(where, scope_level): """ ensure that the where is a Term or a list of Term @@ -196,7 +198,6 @@ class DuplicateWarning(Warning): } # register our configuration options -from pandas.core import config dropna_doc = """ : boolean drop ALL nan rows when appending to a table @@ -219,6 +220,7 @@ class DuplicateWarning(Warning): _table_mod = None _table_file_open_policy_is_strict = False + def _tables(): global _table_mod global _table_file_open_policy_is_strict @@ -234,7 +236,8 @@ def _tables(): # return the file open policy; this changes as of pytables 3.1 # depending on the HDF5 version try: - _table_file_open_policy_is_strict = tables.file._FILE_OPEN_POLICY == 'strict' + _table_file_open_policy_is_strict = ( + tables.file._FILE_OPEN_POLICY == 'strict') except: pass @@ -242,6 +245,7 @@ def _tables(): # interface to/from ### + def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, append=None, **kwargs): """ store this object, close it if we opened it """ @@ -253,7 +257,7 @@ def to_hdf(path_or_buf, key, value, mode=None, complevel=None, complib=None, if isinstance(path_or_buf, string_types): with HDFStore(path_or_buf, mode=mode, complevel=complevel, - complib=complib) as store: + complib=complib) as store: f(store) else: f(path_or_buf) @@ -295,8 +299,8 @@ def read_hdf(path_or_buf, key=None, **kwargs): try: exists = os.path.exists(path_or_buf) - 
#if filepath is too long - except (TypeError,ValueError): + # if filepath is too long + except (TypeError, ValueError): exists = False if not exists: @@ -380,9 +384,10 @@ class HDFStore(StringMixin): def __init__(self, path, mode=None, complevel=None, complib=None, fletcher32=False, **kwargs): try: - import tables + import tables # noqa except ImportError as ex: # pragma: no cover - raise ImportError('HDFStore requires PyTables, "{ex}" problem importing'.format(ex=str(ex))) + raise ImportError('HDFStore requires PyTables, "{ex}" problem ' + 'importing'.format(ex=str(ex))) if complib not in (None, 'blosc', 'bzip2', 'lzo', 'zlib'): raise ValueError("complib only supports 'blosc', 'bzip2', lzo' " @@ -546,13 +551,17 @@ def open(self, mode='a', **kwargs): # trap PyTables >= 3.1 FILE_OPEN_POLICY exception # to provide an updated message if 'FILE_OPEN_POLICY' in str(e): - - e = ValueError("PyTables [{version}] no longer supports opening multiple files\n" - "even in read-only mode on this HDF5 version [{hdf_version}]. You can accept this\n" - "and not open the same file multiple times at once,\n" - "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 which allows\n" - "files to be opened multiple times at once\n".format(version=tables.__version__, - hdf_version=tables.get_hdf5_version())) + e = ValueError( + "PyTables [{version}] no longer supports opening multiple " + "files\n" + "even in read-only mode on this HDF5 version " + "[{hdf_version}]. You can accept this\n" + "and not open the same file multiple times at once,\n" + "upgrade the HDF5 version, or downgrade to PyTables 3.0.0 " + "which allows\n" + "files to be opened multiple times at once\n" + .format(version=tables.__version__, + hdf_version=tables.get_hdf5_version())) raise e @@ -662,9 +671,9 @@ def func(_start, _stop, _where): columns=columns, **kwargs) # create the iterator - it = TableIterator(self, s, func, where=where, nrows=s.nrows, start=start, - stop=stop, iterator=iterator, chunksize=chunksize, - auto_close=auto_close) + it = TableIterator(self, s, func, where=where, nrows=s.nrows, + start=start, stop=stop, iterator=iterator, + chunksize=chunksize, auto_close=auto_close) return it.get_result() @@ -751,7 +760,7 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, # validate rows nrows = None - for t, k in itertools.chain([(s,selector)], zip(tbls, keys)): + for t, k in itertools.chain([(s, selector)], zip(tbls, keys)): if t is None: raise KeyError("Invalid table [%s]" % k) if not t.is_table: @@ -771,21 +780,22 @@ def select_as_multiple(self, keys, where=None, selector=None, columns=None, def func(_start, _stop, _where): - # retrieve the objs, _where is always passed as a set of coordinates here - objs = [t.read(where=_where, columns=columns, **kwargs) for t in tbls] + # retrieve the objs, _where is always passed as a set of + # coordinates here + objs = [t.read(where=_where, columns=columns, **kwargs) + for t in tbls] # concat and return return concat(objs, axis=axis, verify_integrity=False).consolidate() # create the iterator - it = TableIterator(self, s, func, where=where, nrows=nrows, start=start, - stop=stop, iterator=iterator, chunksize=chunksize, - auto_close=auto_close) + it = TableIterator(self, s, func, where=where, nrows=nrows, + start=start, stop=stop, iterator=iterator, + chunksize=chunksize, auto_close=auto_close) return it.get_result(coordinates=True) - def put(self, key, value, format=None, append=False, **kwargs): """ Store object in HDFStore @@ -1290,8 +1300,8 @@ class 
TableIterator(object): def __init__(self, store, s, func, where, nrows, start=None, stop=None, iterator=False, chunksize=None, auto_close=False): self.store = store - self.s = s - self.func = func + self.s = s + self.func = func self.where = where self.nrows = nrows or 0 self.start = start or 0 @@ -1353,6 +1363,7 @@ def get_result(self, coordinates=False): self.close() return results + class IndexCol(StringMixin): """ an index column description class @@ -1624,14 +1635,15 @@ def validate_metadata(self, handler): new_metadata = self.metadata cur_metadata = handler.read_metadata(self.cname) if new_metadata is not None and cur_metadata is not None \ - and not com.array_equivalent(new_metadata, cur_metadata): - raise ValueError("cannot append a categorical with different categories" - " to the existing") + and not com.array_equivalent(new_metadata, cur_metadata): + raise ValueError("cannot append a categorical with " + "different categories to the existing") def write_metadata(self, handler): """ set the meta data """ if self.metadata is not None: - handler.write_metadata(self.cname,self.metadata) + handler.write_metadata(self.cname, self.metadata) + class GenericIndexCol(IndexCol): @@ -1669,7 +1681,7 @@ class DataCol(IndexCol): """ is_an_indexable = False is_data_indexable = False - _info_fields = ['tz','ordered'] + _info_fields = ['tz', 'ordered'] @classmethod def create_for_block( @@ -1694,9 +1706,10 @@ def create_for_block( return cls(name=name, cname=cname, **kwargs) def __init__(self, values=None, kind=None, typ=None, - cname=None, data=None, meta=None, metadata=None, block=None, **kwargs): - super(DataCol, self).__init__( - values=values, kind=kind, typ=typ, cname=cname, **kwargs) + cname=None, data=None, meta=None, metadata=None, + block=None, **kwargs): + super(DataCol, self).__init__(values=values, kind=kind, typ=typ, + cname=cname, **kwargs) self.dtype = None self.dtype_attr = u("%s_dtype" % self.name) self.meta = meta @@ -1737,7 +1750,7 @@ def take_data(self): def set_metadata(self, metadata): """ record the metadata """ if metadata is not None: - metadata = np.array(metadata,copy=False).ravel() + metadata = np.array(metadata, copy=False).ravel() self.metadata = metadata def set_kind(self): @@ -1776,7 +1789,8 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, # short-cut certain block types if block.is_categorical: - return self.set_atom_categorical(block, items=block_items, info=info) + return self.set_atom_categorical(block, items=block_items, + info=info) elif block.is_datetimetz: return self.set_atom_datetime64tz(block, info=info) elif block.is_datetime: @@ -1800,7 +1814,7 @@ def set_atom(self, block, block_items, existing_col, min_itemsize, raise TypeError( "too many timezones in this block, create separate " "data columns" - ) + ) elif inferred_type == 'unicode': raise TypeError( "[unicode] is not implemented as a table column") @@ -1886,7 +1900,8 @@ def get_atom_data(self, block, kind=None): def set_atom_complex(self, block): self.kind = block.dtype.name itemsize = int(self.kind.split('complex')[-1]) // 8 - self.typ = _tables().ComplexCol(itemsize=itemsize, shape=block.shape[0]) + self.typ = _tables().ComplexCol( + itemsize=itemsize, shape=block.shape[0]) self.set_data(block.values.astype(self.typ.type, copy=False)) def set_atom_data(self, block): @@ -2239,7 +2254,10 @@ def write(self, **kwargs): "cannot write on an abstract storer: sublcasses should implement") def delete(self, where=None, start=None, stop=None, **kwargs): - """ support fully deleting the 
node in its entirety (only) - where specification must be None """ + """ + support fully deleting the node in its entirety (only) - where + specification must be None + """ if where is None and start is None and stop is None: self._handle.remove_node(self.group, recursive=True) return None @@ -2252,7 +2270,7 @@ class GenericFixed(Fixed): """ a generified fixed version """ _index_type_map = {DatetimeIndex: 'datetime', PeriodIndex: 'period'} _reverse_index_map = dict([(v, k) - for k, v in compat.iteritems(_index_type_map)]) + for k, v in compat.iteritems(_index_type_map)]) attributes = [] # indexer helpders @@ -2515,8 +2533,8 @@ def write_array(self, key, value, items=None): # create an empty chunked array and fill it from value if not empty_array: ca = self._handle.create_carray(self.group, key, atom, - value.shape, - filters=self._filters) + value.shape, + filters=self._filters) ca[:] = value getattr(self.group, key)._v_attrs.transposed = transposed @@ -2543,14 +2561,15 @@ def write_array(self, key, value, items=None): warnings.warn(ws, PerformanceWarning, stacklevel=7) vlarr = self._handle.create_vlarray(self.group, key, - _tables().ObjectAtom()) + _tables().ObjectAtom()) vlarr.append(value) else: if empty_array: self.write_array_empty(key, value) else: if com.is_datetime64_dtype(value.dtype): - self._handle.create_array(self.group, key, value.view('i8')) + self._handle.create_array( + self.group, key, value.view('i8')) getattr( self.group, key)._v_attrs.value_type = 'datetime64' elif com.is_datetime64tz_dtype(value.dtype): @@ -2563,7 +2582,8 @@ def write_array(self, key, value, items=None): node._v_attrs.tz = _get_tz(value.tz) node._v_attrs.value_type = 'datetime64' elif com.is_timedelta64_dtype(value.dtype): - self._handle.create_array(self.group, key, value.view('i8')) + self._handle.create_array( + self.group, key, value.view('i8')) getattr( self.group, key)._v_attrs.value_type = 'timedelta64' else: @@ -2778,7 +2798,8 @@ def write(self, obj, **kwargs): for i, ax in enumerate(data.axes): if i == 0: if not ax.is_unique: - raise ValueError("Columns index has to be unique for fixed format") + raise ValueError( + "Columns index has to be unique for fixed format") self.write_index('axis%d' % i, ax) # Supporting mixed-type DataFrame objects...nontrivial @@ -2911,7 +2932,8 @@ def is_multi_index(self): def validate_metadata(self, existing): """ create / validate metadata """ - self.metadata = [ c.name for c in self.values_axes if c.metadata is not None ] + self.metadata = [ + c.name for c in self.values_axes if c.metadata is not None] def validate_multiindex(self, obj): """validate that we can store the multi-index; reset and return the @@ -3008,11 +3030,11 @@ def write_metadata(self, key, values): """ values = Series(values) self.parent.put(self._get_metadata_path(key), values, format='table', - encoding=self.encoding, nan_rep=self.nan_rep) + encoding=self.encoding, nan_rep=self.nan_rep) def read_metadata(self, key): """ return the meta data array for this key """ - if getattr(getattr(self.group,'meta',None),key,None) is not None: + if getattr(getattr(self.group, 'meta', None), key, None) is not None: return self.parent.select(self._get_metadata_path(key)) return None @@ -3173,11 +3195,13 @@ def create_index(self, columns=None, optlevel=None, kind=None): # create the index if not v.is_indexed: if v.type.startswith('complex'): - raise TypeError('Columns containing complex values can be stored but cannot' - ' be indexed when using table format. 
Either use fixed ' - 'format, set index=False, or do not include the columns ' - 'containing complex values to data_columns when ' - 'initializing the table.') + raise TypeError( + 'Columns containing complex values can be stored ' + 'but cannot' + ' be indexed when using table format. Either use ' + 'fixed format, set index=False, or do not include ' + 'the columns containing complex values to ' + 'data_columns when initializing the table.') v.create_index(**kw) def read_axes(self, where, **kwargs): @@ -3553,8 +3577,10 @@ def read_coordinates(self, where=None, start=None, stop=None, **kwargs): coords = self.selection.select_coords() if self.selection.filter is not None: for field, op, filt in self.selection.filter.format(): - data = self.read_column(field, start=coords.min(), stop=coords.max()+1) - coords = coords[op(data.iloc[coords-coords.min()], filt).values] + data = self.read_column( + field, start=coords.min(), stop=coords.max() + 1) + coords = coords[ + op(data.iloc[coords - coords.min()], filt).values] return Index(coords) @@ -3643,7 +3669,8 @@ def read(self, where=None, columns=None, **kwargs): if not self.read_axes(where=where, **kwargs): return None - factors = [Categorical.from_array(a.values, ordered=True) for a in self.index_axes] + factors = [Categorical.from_array( + a.values, ordered=True) for a in self.index_axes] levels = [f.categories for f in factors] N = [len(f.categories) for f in factors] labels = [f.codes for f in factors] @@ -3664,7 +3691,8 @@ def read(self, where=None, columns=None, **kwargs): # the data need to be sorted sorted_values = c.take_data().take(sorter, axis=0) if sorted_values.ndim == 1: - sorted_values = sorted_values.reshape((sorted_values.shape[0],1)) + sorted_values = sorted_values.reshape( + (sorted_values.shape[0], 1)) take_labels = [l.take(sorter) for l in labels] items = Index(c.values) @@ -3771,10 +3799,10 @@ def write(self, obj, axes=None, append=False, complib=None, self.set_attrs() # create the table - table = self._handle.create_table(self.group, **options) - + self._handle.create_table(self.group, **options) else: - table = self.table + pass + # table = self.table # update my info self.set_info() @@ -3828,7 +3856,7 @@ def write_data(self, chunksize, dropna=False): if i < nindexes - 1: repeater = np.prod([indexes[bi].shape[0] - for bi in range(i + 1, nindexes)]) + for bi in range(i + 1, nindexes)]) idx = np.repeat(idx, repeater) bindexes.append(idx) @@ -3847,7 +3875,7 @@ def write_data(self, chunksize, dropna=False): if chunksize is None: chunksize = 100000 - rows = np.empty(min(chunksize,nrows), dtype=self.dtype) + rows = np.empty(min(chunksize, nrows), dtype=self.dtype) chunks = int(nrows / chunksize) + 1 for i in range(chunks): start_i = i * chunksize @@ -3928,7 +3956,8 @@ def delete(self, where=None, start=None, stop=None, **kwargs): # create the selection table = self.table - self.selection = Selection(self, where, start=start, stop=stop, **kwargs) + self.selection = Selection( + self, where, start=start, stop=stop, **kwargs) values = self.selection.select_coords() # delete the rows in reverse order @@ -3958,7 +3987,7 @@ def delete(self, where=None, start=None, stop=None, **kwargs): for g in reversed(groups): rows = l.take(lrange(g, pg)) table.remove_rows(start=rows[rows.index[0] - ], stop=rows[rows.index[-1]] + 1) + ], stop=rows[rows.index[-1]] + 1) pg = g self.table.flush() @@ -4177,6 +4206,7 @@ def read(self, **kwargs): return df + class AppendablePanelTable(AppendableTable): """ suppor the new appendable table formats """ @@ 
-4232,7 +4262,9 @@ def _get_info(info, name): idx = info[name] = dict() return idx -### tz to/from coercion ### +# tz to/from coercion + + def _get_tz(tz): """ for a tz-aware type, return an encoded zone """ zone = tslib.get_timezone(tz) @@ -4240,6 +4272,7 @@ def _get_tz(tz): zone = tslib.tot_seconds(tz.utcoffset()) return zone + def _set_tz(values, tz, preserve_UTC=False, coerce=False): """ coerce the values to a DatetimeIndex if tz is set @@ -4267,6 +4300,7 @@ def _set_tz(values, tz, preserve_UTC=False, coerce=False): return values + def _convert_index(index, encoding=None, format_type=None): index_name = getattr(index, 'name', None) @@ -4393,7 +4427,8 @@ def _unconvert_index_legacy(data, kind, legacy=False, encoding=None): def _convert_string_array(data, encoding, itemsize=None): """ - we take a string-like that is object dtype and coerce to a fixed size string type + we take a string-like that is object dtype and coerce to a fixed size + string type Parameters ---------- @@ -4408,7 +4443,8 @@ def _convert_string_array(data, encoding, itemsize=None): # encode if needed if encoding is not None and len(data): - data = Series(data.ravel()).str.encode(encoding).values.reshape(data.shape) + data = Series(data.ravel()).str.encode( + encoding).values.reshape(data.shape) # create the sized dtype if itemsize is None: @@ -4417,6 +4453,7 @@ def _convert_string_array(data, encoding, itemsize=None): data = np.asarray(data, dtype="S%d" % itemsize) return data + def _unconvert_string_array(data, nan_rep=None, encoding=None): """ inverse of _convert_string_array @@ -4552,7 +4589,7 @@ def generate(self, where): q = self.table.queryables() try: return Expr(where, queryables=q, encoding=self.table.encoding) - except NameError as detail: + except NameError: # raise a nice message, suggesting that the user should use # data_columns raise ValueError( @@ -4572,7 +4609,8 @@ def select(self): """ if self.condition is not None: return self.table.table.read_where(self.condition.format(), - start=self.start, stop=self.stop) + start=self.start, + stop=self.stop) elif self.coordinates is not None: return self.table.table.read_coordinates(self.coordinates) return self.table.table.read(start=self.start, stop=self.stop) @@ -4594,8 +4632,8 @@ def select_coords(self): if self.condition is not None: return self.table.table.get_where_list(self.condition.format(), - start=start, stop=stop, - sort=True) + start=start, stop=stop, + sort=True) elif self.coordinates is not None: return self.coordinates @@ -4603,6 +4641,7 @@ def select_coords(self): # utilities ### + def timeit(key, df, fn=None, remove=True, **kwargs): if fn is None: fn = 'timeit.h5' diff --git a/pandas/io/sas.py b/pandas/io/sas.py index 006c2aaf55ca8..39e83b7715cda 100644 --- a/pandas/io/sas.py +++ b/pandas/io/sas.py @@ -16,15 +16,18 @@ import numpy as np from pandas.util.decorators import Appender -_correct_line1 = "HEADER RECORD*******LIBRARY HEADER RECORD!!!!!!!000000000000000000000000000000 " -_correct_header1 = "HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!000000000000000001600000000" -_correct_header2 = "HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!000000000000000000000000000000 " -_correct_obs_header = "HEADER RECORD*******OBS HEADER RECORD!!!!!!!000000000000000000000000000000 " +_correct_line1 = ("HEADER RECORD*******LIBRARY HEADER RECORD" + "!!!!!!!000000000000000000000000000000 ") +_correct_header1 = ("HEADER RECORD*******MEMBER HEADER RECORD!!!!!!!" 
+ "000000000000000001600000000") +_correct_header2 = ("HEADER RECORD*******DSCRPTR HEADER RECORD!!!!!!!" + "000000000000000000000000000000 ") +_correct_obs_header = ("HEADER RECORD*******OBS HEADER RECORD!!!!!!!" + "000000000000000000000000000000 ") _fieldkeys = ['ntype', 'nhfun', 'field_length', 'nvar0', 'name', 'label', 'nform', 'nfl', 'num_decimals', 'nfj', 'nfill', 'niform', 'nifl', 'nifd', 'npos', '_'] - _base_params_doc = """\ Parameters ---------- @@ -110,13 +113,14 @@ @Appender(_read_sas_doc) -def read_sas(filepath_or_buffer, format='xport', index=None, encoding='ISO-8859-1', - chunksize=None, iterator=False): +def read_sas(filepath_or_buffer, format='xport', index=None, + encoding='ISO-8859-1', chunksize=None, iterator=False): format = format.lower() if format == 'xport': - reader = XportReader(filepath_or_buffer, index=index, encoding=encoding, + reader = XportReader(filepath_or_buffer, index=index, + encoding=encoding, chunksize=chunksize) else: raise ValueError('only xport format is supported') @@ -130,7 +134,8 @@ def read_sas(filepath_or_buffer, format='xport', index=None, encoding='ISO-8859- def _parse_date(datestr): """ Given a date in xport format, return Python date. """ try: - return datetime.strptime(datestr, "%d%b%y:%H:%M:%S") # e.g. "16FEB11:10:07:55" + # e.g. "16FEB11:10:07:55" + return datetime.strptime(datestr, "%d%b%y:%H:%M:%S") except ValueError: return pd.NaT @@ -151,7 +156,7 @@ def _split_line(s, parts): out = {} start = 0 for name, length in parts: - out[name] = s[start:start+length].strip() + out[name] = s[start:start + length].strip() start += length del out['_'] return out @@ -225,7 +230,8 @@ def _parse_float_vec(vec): # incremented by 1 and the fraction bits left 4 positions to the # right of the radix point. (had to add >> 24 because C treats & # 0x7f as 0x7f000000 and Python doesn't) - ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) + shift + 1023) << 20) | (xport1 & 0x80000000) + ieee1 |= ((((((xport1 >> 24) & 0x7f) - 65) << 2) + + shift + 1023) << 20) | (xport1 & 0x80000000) ieee = np.empty((len(ieee1),), dtype='>u4,>u4') ieee['f0'] = ieee1 @@ -236,11 +242,9 @@ def _parse_float_vec(vec): return ieee - class XportReader(object): __doc__ = _xport_reader_doc - def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1', chunksize=None): @@ -266,11 +270,9 @@ def __init__(self, filepath_or_buffer, index=None, encoding='ISO-8859-1', self._read_header() - def _get_row(self): return self.filepath_or_buffer.read(80).decode() - def _read_header(self): self.filepath_or_buffer.seek(0) @@ -280,8 +282,8 @@ def _read_header(self): raise ValueError("Header record is not an XPORT file.") line2 = self._get_row() - file_info = _split_line(line2, [['prefix', 24], ['version', 8], ['OS', 8], - ['_', 24], ['created', 16]]) + file_info = _split_line(line2, [['prefix', 24], ['version', 8], + ['OS', 8], ['_', 24], ['created', 16]]) if file_info['prefix'] != "SAS SAS SASLIB": raise ValueError("Header record has invalid prefix.") file_info['created'] = _parse_date(file_info['created']) @@ -293,16 +295,22 @@ def _read_header(self): # read member header header1 = self._get_row() header2 = self._get_row() - if not header1.startswith(_correct_header1) or not header2 == _correct_header2: + if (not header1.startswith(_correct_header1) or + not header2 == _correct_header2): raise ValueError("Member header not found.") - fieldnamelength = int(header1[-5:-2]) # usually 140, could be 135 + fieldnamelength = int(header1[-5:-2]) # usually 140, could be 135 # member info - 
member_info = _split_line(self._get_row(), [['prefix', 8], ['set_name', 8], - ['sasdata', 8],['version', 8], - ['OS', 8],['_', 24],['created', 16]]) - member_info.update( _split_line(self._get_row(), [['modified', 16], ['_', 16], - ['label', 40],['type', 8]])) + member_info = _split_line(self._get_row(), [['prefix', 8], + ['set_name', 8], + ['sasdata', 8], + ['version', 8], + ['OS', 8], ['_', 24], + ['created', 16]]) + member_info.update(_split_line(self._get_row(), + [['modified', 16], + ['_', 16], + ['label', 40], ['type', 8]])) member_info['modified'] = _parse_date(member_info['modified']) member_info['created'] = _parse_date(member_info['created']) self.member_info = member_info @@ -310,15 +318,16 @@ def _read_header(self): # read field names types = {1: 'numeric', 2: 'char'} fieldcount = int(self._get_row()[54:58]) - datalength = fieldnamelength*fieldcount - if datalength % 80: # round up to nearest 80 - datalength += 80 - datalength%80 + datalength = fieldnamelength * fieldcount + if datalength % 80: # round up to nearest 80 + datalength += 80 - datalength % 80 fielddata = self.filepath_or_buffer.read(datalength) fields = [] obs_length = 0 while len(fielddata) >= fieldnamelength: # pull data for one field - field, fielddata = (fielddata[:fieldnamelength], fielddata[fieldnamelength:]) + field, fielddata = ( + fielddata[:fieldnamelength], fielddata[fieldnamelength:]) # rest at end gets ignored, so if field is short, pad out # to match struct pattern below @@ -330,7 +339,8 @@ def _read_header(self): field['ntype'] = types[field['ntype']] fl = field['field_length'] if field['ntype'] == 'numeric' and ((fl < 2) or (fl > 8)): - raise TypeError("Floating point field width %d is not between 2 and 8." % fw) + raise TypeError("Floating point field width %d is not between " + "2 and 8." % fl) for k, v in field.items(): try: @@ -354,12 +364,11 @@ def _read_header(self): # Setup the dtype. dtypel = [] - for i,field in enumerate(self.fields): + for i, field in enumerate(self.fields): dtypel.append(('s' + str(i), "S" + str(field['field_length']))) dtype = np.dtype(dtypel) self._dtype = dtype - def __iter__(self): try: if self._chunksize: @@ -370,20 +379,22 @@ def __iter__(self): except StopIteration: pass - def _record_count(self): """ Get number of records in file. - This is maybe suboptimal because we have to seek to the end of the file. + This is maybe suboptimal because we have to seek to the end of the + file. Side effect: returns file position to record_start. 
""" self.filepath_or_buffer.seek(0, 2) - total_records_length = self.filepath_or_buffer.tell() - self.record_start + total_records_length = (self.filepath_or_buffer.tell() - + self.record_start) if total_records_length % 80 != 0: + import warnings warnings.warn("xport file may be corrupted") if self.record_length > 80: @@ -406,7 +417,6 @@ def _record_count(self): return (total_records_length - tail_pad) // self.record_length - def get_chunk(self, size=None): """ Reads lines from Xport file and returns as dataframe @@ -424,7 +434,6 @@ def get_chunk(self, size=None): size = self._chunksize return self.read(nrows=size) - def _missing_double(self, vec): v = vec.view(dtype='u1,u1,u2,u4') miss = (v['f1'] == 0) & (v['f2'] == 0) & (v['f3'] == 0) @@ -433,7 +442,6 @@ def _missing_double(self, vec): miss &= miss1 return miss - @Appender(_read_method_doc) def read(self, nrows=None): @@ -448,11 +456,12 @@ def read(self, nrows=None): data = np.frombuffer(raw, dtype=self._dtype, count=read_lines) df = pd.DataFrame(index=range(read_lines)) - for j,x in enumerate(self.columns): + for j, x in enumerate(self.columns): vec = data['s%d' % j] ntype = self.fields[j]['ntype'] if ntype == "numeric": - vec = _handle_truncated_float_vec(vec, self.fields[j]['field_length']) + vec = _handle_truncated_float_vec( + vec, self.fields[j]['field_length']) miss = self._missing_double(vec) v = _parse_float_vec(vec) v[miss] = np.nan diff --git a/pandas/io/sql.py b/pandas/io/sql.py index 95a6d02b1ccb6..63725988c8065 100644 --- a/pandas/io/sql.py +++ b/pandas/io/sql.py @@ -14,7 +14,8 @@ import pandas.lib as lib import pandas.core.common as com -from pandas.compat import lzip, map, zip, raise_with_traceback, string_types, text_type +from pandas.compat import (lzip, map, zip, raise_with_traceback, + string_types, text_type) from pandas.core.api import DataFrame, Series from pandas.core.common import isnull from pandas.core.base import PandasObject @@ -33,8 +34,8 @@ class DatabaseError(IOError): pass -#------------------------------------------------------------------------------ -#--- Helper functions +# ----------------------------------------------------------------------------- +# -- Helper functions _SQLALCHEMY_INSTALLED = None @@ -85,15 +86,16 @@ def _handle_date_column(col, format=None): else: if format in ['D', 's', 'ms', 'us', 'ns']: return to_datetime(col, errors='coerce', unit=format, utc=True) - elif (issubclass(col.dtype.type, np.floating) - or issubclass(col.dtype.type, np.integer)): + elif (issubclass(col.dtype.type, np.floating) or + issubclass(col.dtype.type, np.integer)): # parse dates as timestamp format = 's' if format is None else format return to_datetime(col, errors='coerce', unit=format, utc=True) elif com.is_datetime64tz_dtype(col): # coerce to UTC timezone # GH11216 - return to_datetime(col,errors='coerce').astype('datetime64[ns, UTC]') + return (to_datetime(col, errors='coerce') + .astype('datetime64[ns, UTC]')) else: return to_datetime(col, errors='coerce', format=format, utc=True) @@ -118,7 +120,6 @@ def _parse_date_columns(data_frame, parse_dates): fmt = None data_frame[col_name] = _handle_date_column(df_col, format=fmt) - # we want to coerce datetime64_tz dtypes for now # we could in theory do a 'nice' conversion from a FixedOffset tz # GH11216 @@ -152,7 +153,7 @@ def execute(sql, con, cur=None, params=None): ---------- sql : string Query to be executed - con : SQLAlchemy connectable(engine/connection) or sqlite3 DBAPI2 connection + con : SQLAlchemy connectable(engine/connection) or sqlite3 connection 
Using SQLAlchemy makes it possible to use any DB supported by that library. If a DBAPI2 object, only sqlite3 is supported. @@ -172,8 +173,8 @@ def execute(sql, con, cur=None, params=None): return pandas_sql.execute(*args) -#------------------------------------------------------------------------------ -#--- Deprecated tquery and uquery +# ----------------------------------------------------------------------------- +# -- Deprecated tquery and uquery def _safe_fetch(cur): try: @@ -204,7 +205,8 @@ def tquery(sql, con=None, cur=None, retry=True): SQL query to be executed con: DBAPI2 connection, default: None cur: deprecated, cursor is obtained from connection, default: None - retry: boolean value to specify whether to retry after failure, default: True + retry: boolean value to specify whether to retry after failure + default: True Returns ------- @@ -258,7 +260,8 @@ def uquery(sql, con=None, cur=None, retry=True, params=None): SQL query to be executed con: DBAPI2 connection, default: None cur: deprecated, cursor is obtained from connection, default: None - retry: boolean value to specify whether to retry after failure, default: True + retry: boolean value to specify whether to retry after failure + default: True params: list or tuple, optional, default: None List of parameters to pass to execute method. @@ -289,8 +292,8 @@ def uquery(sql, con=None, cur=None, retry=True, params=None): return result -#------------------------------------------------------------------------------ -#--- Read and write to DataFrames +# ----------------------------------------------------------------------------- +# -- Read and write to DataFrames def read_sql_table(table_name, con, schema=None, index_col=None, coerce_float=True, parse_dates=None, columns=None, @@ -601,7 +604,8 @@ def has_table(table_name, con, flavor='sqlite', schema=None): _MYSQL_WARNING = ("The 'mysql' flavor with DBAPI connection is deprecated " "and will be removed in future versions. " - "MySQL will be further supported with SQLAlchemy connectables.") + "MySQL will be further supported with SQLAlchemy " + "connectables.") def _engine_builder(con): @@ -609,17 +613,18 @@ def _engine_builder(con): Returns a SQLAlchemy engine from a URI (if con is a string) else it just return con without modifying it """ + global _SQLALCHEMY_INSTALLED if isinstance(con, string_types): try: import sqlalchemy con = sqlalchemy.create_engine(con) return con - except ImportError: _SQLALCHEMY_INSTALLED = False return con + def pandasSQL_builder(con, flavor=None, schema=None, meta=None, is_cursor=False): """ @@ -646,6 +651,7 @@ class SQLTable(PandasObject): pass them between functions all the time. 
""" # TODO: support for multiIndex + def __init__(self, name, pandas_sql_engine, frame=None, index=True, if_exists='fail', prefix='pandas', index_label=None, schema=None, keys=None, dtype=None): @@ -829,8 +835,8 @@ def _index_name(self, index, index_label): else: return index_label # return the used column labels for the index columns - if (nlevels == 1 and 'index' not in self.frame.columns - and self.frame.index.name is None): + if (nlevels == 1 and 'index' not in self.frame.columns and + self.frame.index.name is None): return ['index'] else: return [l if l is not None else "level_{0}".format(i) @@ -857,7 +863,7 @@ def _get_column_names_and_types(self, dtype_mapper): dtype_mapper(self.frame.iloc[:, i]), False) for i in range(len(self.frame.columns)) - ] + ] return column_names_and_types @@ -913,7 +919,8 @@ def _harmonize_columns(self, parse_dates=None): # the type the dataframe column should have col_type = self._get_dtype(sql_col.type) - if col_type is datetime or col_type is date or col_type is DatetimeTZDtype: + if (col_type is datetime or col_type is date or + col_type is DatetimeTZDtype): self.frame[col_name] = _handle_date_column(df_col) elif col_type is float: @@ -923,7 +930,8 @@ def _harmonize_columns(self, parse_dates=None): elif len(df_col) == df_col.count(): # No NA values, can convert ints and bools if col_type is np.dtype('int64') or col_type is bool: - self.frame[col_name] = df_col.astype(col_type, copy=False) + self.frame[col_name] = df_col.astype( + col_type, copy=False) # Handle date parsing if col_name in parse_dates: @@ -959,12 +967,13 @@ def _sqlalchemy_type(self, col): col_type = self._get_notnull_col_dtype(col) - from sqlalchemy.types import (BigInteger, Integer, Float, Text, Boolean, - DateTime, Date, Time) + from sqlalchemy.types import (BigInteger, Integer, Float, + Text, Boolean, + DateTime, Date, Time) if col_type == 'datetime64' or col_type == 'datetime': try: - tz = col.tzinfo + tz = col.tzinfo # noqa return DateTime(timezone=True) except: return DateTime @@ -995,7 +1004,8 @@ def _sqlalchemy_type(self, col): return Text def _get_dtype(self, sqltype): - from sqlalchemy.types import Integer, Float, Boolean, DateTime, Date, TIMESTAMP + from sqlalchemy.types import (Integer, Float, Boolean, DateTime, + Date, TIMESTAMP) if isinstance(sqltype, Float): return float @@ -1023,12 +1033,12 @@ class PandasSQL(PandasObject): """ def read_sql(self, *args, **kwargs): - raise ValueError("PandasSQL must be created with an SQLAlchemy connectable" - " or connection+sql flavor") + raise ValueError("PandasSQL must be created with an SQLAlchemy " + "connectable or connection+sql flavor") def to_sql(self, *args, **kwargs): - raise ValueError("PandasSQL must be created with an SQLAlchemy connectable" - " or connection+sql flavor") + raise ValueError("PandasSQL must be created with an SQLAlchemy " + "connectable or connection+sql flavor") class SQLDatabase(PandasSQL): @@ -1158,10 +1168,10 @@ def read_query(self, sql, index_col=None, coerce_float=True, - Dict of ``{column_name: format string}`` where format string is strftime compatible in case of parsing string times or is one of (D, s, ns, ms, us) in case of parsing integer timestamps - - Dict of ``{column_name: arg dict}``, where the arg dict corresponds - to the keyword arguments of :func:`pandas.to_datetime` - Especially useful with databases without native Datetime support, - such as SQLite + - Dict of ``{column_name: arg dict}``, where the arg dict + corresponds to the keyword arguments of + :func:`pandas.to_datetime` Especially 
useful with databases + without native Datetime support, such as SQLite chunksize : int, default None If specified, return an iterator where `chunksize` is the number of rows to include in each chunk. @@ -1250,7 +1260,8 @@ def to_sql(self, frame, name, if_exists='fail', index=True, warnings.warn("The provided table name '{0}' is not found exactly " "as such in the database after writing the table, " "possibly due to case sensitivity issues. Consider " - "using lower case table names.".format(name), UserWarning) + "using lower case table names.".format(name), + UserWarning) @property def tables(self): @@ -1334,6 +1345,7 @@ def _get_unicode_name(name): raise ValueError("Cannot convert identifier to UTF-8: '%s'" % name) return uname + def _get_valid_mysql_name(name): # Filter for unquoted identifiers # See http://dev.mysql.com/doc/refman/5.0/en/identifiers.html @@ -1351,7 +1363,8 @@ def _get_valid_mysql_name(name): def _get_valid_sqlite_name(name): - # See http://stackoverflow.com/questions/6514274/how-do-you-escape-strings-for-sqlite-table-column-names-in-python + # See http://stackoverflow.com/questions/6514274/how-do-you-escape-strings\ + # -for-sqlite-table-column-names-in-python # Ensure the string can be encoded as UTF-8. # Ensure the string does not include any NUL characters. # Replace all " with "". @@ -1447,7 +1460,7 @@ def _create_table_setup(self): cnames_br = ", ".join([escape(c) for c in keys]) create_tbl_stmts.append( "CONSTRAINT {tbl}_pk PRIMARY KEY ({cnames_br})".format( - tbl=self.name, cnames_br=cnames_br)) + tbl=self.name, cnames_br=cnames_br)) create_stmts = ["CREATE TABLE " + escape(self.name) + " (\n" + ',\n '.join(create_tbl_stmts) + "\n)"] @@ -1458,7 +1471,7 @@ def _create_table_setup(self): cnames = "_".join(ix_cols) cnames_br = ",".join([escape(c) for c in ix_cols]) create_stmts.append( - "CREATE INDEX " + escape("ix_"+self.name+"_"+cnames) + + "CREATE INDEX " + escape("ix_" + self.name + "_" + cnames) + "ON " + escape(self.name) + " (" + cnames_br + ")") return create_stmts @@ -1546,7 +1559,8 @@ def execute(self, *args, **kwargs): " to rollback" % (args[0], exc)) raise_with_traceback(ex) - ex = DatabaseError("Execution failed on sql '%s': %s" % (args[0], exc)) + ex = DatabaseError( + "Execution failed on sql '%s': %s" % (args[0], exc)) raise_with_traceback(ex) @staticmethod @@ -1557,7 +1571,7 @@ def _query_iterator(cursor, chunksize, columns, index_col=None, while True: data = cursor.fetchmany(chunksize) if type(data) == tuple: - data = list(data) + data = list(data) if not data: cursor.close() break @@ -1636,8 +1650,10 @@ def to_sql(self, frame, name, if_exists='fail', index=True, table.insert(chunksize) def has_table(self, name, schema=None): - escape = _SQL_GET_IDENTIFIER[self.flavor] - esc_name = escape(name) + # TODO(wesm): unused? 
+ # escape = _SQL_GET_IDENTIFIER[self.flavor] + # esc_name = escape(name) + wld = _SQL_WILDCARD[self.flavor] flavor_map = { 'sqlite': ("SELECT name FROM sqlite_master " @@ -1645,7 +1661,7 @@ def has_table(self, name, schema=None): 'mysql': "SHOW TABLES LIKE %s" % wld} query = flavor_map.get(self.flavor) - return len(self.execute(query, [name,]).fetchall()) > 0 + return len(self.execute(query, [name, ]).fetchall()) > 0 def get_table(self, table_name, schema=None): return None # not supported in fallback mode diff --git a/pandas/io/stata.py b/pandas/io/stata.py index 806bd3df83843..8181e69abc60b 100644 --- a/pandas/io/stata.py +++ b/pandas/io/stata.py @@ -29,7 +29,9 @@ from pandas.lib import max_len_string_array, infer_dtype from pandas.tslib import NaT, Timestamp -_version_error = "Version of given Stata file is not 104, 105, 108, 113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), 117 (Stata 13), or 118 (Stata 14)" +_version_error = ("Version of given Stata file is not 104, 105, 108, " + "113 (Stata 8/9), 114 (Stata 10/11), 115 (Stata 12), " + "117 (Stata 13), or 118 (Stata 14)") _statafile_processing_params1 = """\ convert_dates : boolean, defaults to True @@ -245,11 +247,12 @@ def convert_year_days_safe(year, days): datetime or datetime64 Series """ if year.max() < (MAX_YEAR - 1) and year.min() > MIN_YEAR: - return to_datetime(year, format='%Y') + to_timedelta(days, unit='d') + return (to_datetime(year, format='%Y') + + to_timedelta(days, unit='d')) else: index = getattr(year, 'index', None) - value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) for - y, d in zip(year, days)] + value = [datetime.datetime(y, 1, 1) + relativedelta(days=int(d)) + for y, d in zip(year, days)] return Series(value, index=index) def convert_delta_safe(base, deltas, unit): @@ -265,8 +268,8 @@ def convert_delta_safe(base, deltas, unit): return Series(values, index=index) elif unit == 'ms': if deltas.max() > MAX_MS_DELTA or deltas.min() < MIN_MS_DELTA: - values = [base + relativedelta(microseconds=(int(d) * 1000)) for - d in deltas] + values = [base + relativedelta(microseconds=(int(d) * 1000)) + for d in deltas] return Series(values, index=index) else: raise ValueError('format not understood') @@ -274,7 +277,8 @@ def convert_delta_safe(base, deltas, unit): deltas = to_timedelta(deltas, unit=unit) return base + deltas - # TODO: If/when pandas supports more than datetime64[ns], this should be improved to use correct range, e.g. datetime[Y] for yearly + # TODO: If/when pandas supports more than datetime64[ns], this should be + # improved to use correct range, e.g. datetime[Y] for yearly bad_locs = np.isnan(dates) has_bad_values = False if bad_locs.any(): @@ -426,8 +430,8 @@ def parse_dates_safe(dates, delta=False, year=False, days=False): excessive_string_length_error = """ -Fixed width strings in Stata .dta files are limited to 244 (or fewer) characters. -Column '%s' does not satisfy this restriction. +Fixed width strings in Stata .dta files are limited to 244 (or fewer) +characters. Column '%s' does not satisfy this restriction. 
""" @@ -462,8 +466,8 @@ class InvalidColumnName(Warning): {0} If this is not what you expect, please make sure you have Stata-compliant -column names in your DataFrame (strings only, max 32 characters, only alphanumerics and -underscores, no Stata reserved words) +column names in your DataFrame (strings only, max 32 characters, only +alphanumerics and underscores, no Stata reserved words) """ @@ -481,17 +485,16 @@ def _cast_to_stata_types(data): ----- Numeric columns in Stata must be one of int8, int16, int32, float32 or float64, with some additional value restrictions. int8 and int16 columns - are checked for violations of the value restrictions and - upcast if needed. int64 data is not usable in Stata, and so it is - downcast to int32 whenever the value are in the int32 range, and - sidecast to float64 when larger than this range. If the int64 values - are outside of the range of those perfectly representable as float64 values, - a warning is raised. - - bool columns are cast to int8. uint colums are converted to int of the same - size if there is no loss in precision, other wise are upcast to a larger - type. uint64 is currently not supported since it is concerted to object in - a DataFrame. + are checked for violations of the value restrictions and upcast if needed. + int64 data is not usable in Stata, and so it is downcast to int32 whenever + the value are in the int32 range, and sidecast to float64 when larger than + this range. If the int64 values are outside of the range of those + perfectly representable as float64 values, a warning is raised. + + bool columns are cast to int8. uint colums are converted to int of the + same size if there is no loss in precision, other wise are upcast to a + larger type. uint64 is currently not supported since it is concerted to + object in a DataFrame. """ ws = '' # original, if small, if large @@ -510,8 +513,8 @@ def _cast_to_stata_types(data): else: dtype = c_data[2] if c_data[2] == np.float64: # Warn if necessary - if data[col].max() >= 2 ** 53: - ws = precision_loss_doc % ('uint64', 'float64') + if data[col].max() >= 2 ** 53: + ws = precision_loss_doc % ('uint64', 'float64') data[col] = data[col].astype(dtype) @@ -523,7 +526,8 @@ def _cast_to_stata_types(data): if data[col].max() > 32740 or data[col].min() < -32767: data[col] = data[col].astype(np.int32) elif dtype == np.int64: - if data[col].max() <= 2147483620 and data[col].min() >= -2147483647: + if (data[col].max() <= 2147483620 and + data[col].min() >= -2147483647): data[col] = data[col].astype(np.int32) else: data[col] = data[col].astype(np.float64) @@ -723,7 +727,8 @@ class StataMissingValue(StringMixin): MISSING_VALUES[value] = '.' 
if i > 0: MISSING_VALUES[value] += chr(96 + i) - int_value = struct.unpack('' or '<' + self.byteorder = struct.unpack('b', self.path_or_buf.read(1))[ + 0] == 0x1 and '>' or '<' self.filetype = struct.unpack('b', self.path_or_buf.read(1))[0] self.path_or_buf.read(1) # unused @@ -1250,8 +1258,8 @@ def _read_old_header(self, first_char): self.data_location = self.path_or_buf.tell() def _calcsize(self, fmt): - return (type(fmt) is int and fmt - or struct.calcsize(self.byteorder + fmt)) + return (type(fmt) is int and fmt or + struct.calcsize(self.byteorder + fmt)) def _decode(self, s): s = s.partition(b"\0")[0] @@ -1335,7 +1343,8 @@ def _read_strls(self): break if self.format_version == 117: - v_o = struct.unpack(self.byteorder + 'Q', self.path_or_buf.read(8))[0] + v_o = struct.unpack(self.byteorder + 'Q', + self.path_or_buf.read(8))[0] else: buf = self.path_or_buf.read(12) # Only tested on little endian file on little endian machine. @@ -1435,7 +1444,8 @@ def read(self, nrows=None, convert_dates=None, dtype = [] # Convert struct data types to numpy data type for i, typ in enumerate(self.typlist): if typ in self.NUMPY_TYPE_MAP: - dtype.append(('s' + str(i), self.byteorder + self.NUMPY_TYPE_MAP[typ])) + dtype.append(('s' + str(i), self.byteorder + + self.NUMPY_TYPE_MAP[typ])) else: dtype.append(('s' + str(i), 'S' + str(typ))) dtype = np.dtype(dtype) @@ -1487,7 +1497,8 @@ def read(self, nrows=None, convert_dates=None, # Decode strings for col, typ in zip(data, self.typlist): if type(typ) is int: - data[col] = data[col].apply(self._null_terminate, convert_dtype=True) + data[col] = data[col].apply( + self._null_terminate, convert_dtype=True) data = self._insert_strls(data) @@ -1503,7 +1514,8 @@ def read(self, nrows=None, convert_dates=None, dtype = data[col].dtype if (dtype != np.dtype(object)) and (dtype != self.dtyplist[i]): requires_type_conversion = True - data_formatted.append((col, Series(data[col], index, self.dtyplist[i]))) + data_formatted.append( + (col, Series(data[col], index, self.dtyplist[i]))) else: data_formatted.append((col, data[col])) if requires_type_conversion: @@ -1771,8 +1783,8 @@ def _dtype_to_default_stata_fmt(dtype, column): # TODO: expand this to handle a default datetime format? if dtype.type == np.object_: inferred_dtype = infer_dtype(column.dropna()) - if not (inferred_dtype in ('string', 'unicode') - or len(column) == 0): + if not (inferred_dtype in ('string', 'unicode') or + len(column) == 0): raise ValueError('Writing general object arrays is not supported') itemsize = max_len_string_array(com._ensure_object(column.values)) if itemsize > 244: @@ -1836,6 +1848,7 @@ class StataWriter(StataParser): >>> writer = StataWriter('./date_data_file.dta', data, {'date' : 'tw'}) >>> writer.write_file() """ + def __init__(self, fname, data, convert_dates=None, write_index=True, encoding="latin-1", byteorder=None, time_stamp=None, data_label=None): @@ -1883,8 +1896,8 @@ def _prepare_categoricals(self, data): self._value_labels.append(StataValueLabel(data[col])) dtype = data[col].cat.codes.dtype if dtype == np.int64: - raise ValueError('It is not possible to export int64-based ' - 'categorical data to Stata.') + raise ValueError('It is not possible to export ' + 'int64-based categorical data to Stata.') values = data[col].cat.codes.values.copy() # Upcast if needed so that correct missing values can be set @@ -1921,16 +1934,17 @@ def _replace_nans(self, data): return data def _check_column_names(self, data): - """Checks column names to ensure that they are valid Stata column names. 
+ """ + Checks column names to ensure that they are valid Stata column names. This includes checks for: * Non-string names * Stata keywords * Variables that start with numbers * Variables with names that are too long - When an illegal variable name is detected, it is converted, and if dates - are exported, the variable name is propogated to the date conversion - dictionary + When an illegal variable name is detected, it is converted, and if + dates are exported, the variable name is propogated to the date + conversion dictionary """ converted_names = [] columns = list(data.columns) @@ -1970,7 +1984,8 @@ def _check_column_names(self, data): orig_name = orig_name.encode('utf-8') except: pass - converted_names.append('{0} -> {1}'.format(orig_name, name)) + converted_names.append( + '{0} -> {1}'.format(orig_name, name)) columns[j] = name diff --git a/pandas/io/tests/generate_legacy_storage_files.py b/pandas/io/tests/generate_legacy_storage_files.py index 91d0333b3407f..f556c980bb80c 100644 --- a/pandas/io/tests/generate_legacy_storage_files.py +++ b/pandas/io/tests/generate_legacy_storage_files.py @@ -2,15 +2,14 @@ from __future__ import print_function from distutils.version import LooseVersion from pandas import (Series, DataFrame, Panel, - SparseSeries, SparseDataFrame, SparsePanel, - Index, MultiIndex, PeriodIndex, bdate_range, to_msgpack, - date_range, period_range, bdate_range, Timestamp, Categorical, - Period) + SparseSeries, SparseDataFrame, + Index, MultiIndex, bdate_range, to_msgpack, + date_range, period_range, + Timestamp, Categorical, Period) import os import sys import numpy as np import pandas -import pandas.util.testing as tm import platform as pl @@ -66,50 +65,68 @@ def create_data(): scalars = dict(timestamp=Timestamp('20130101')) if LooseVersion(pandas.__version__) >= '0.17.0': - scalars['period'] = Period('2012','M') + scalars['period'] = Period('2012', 'M') index = dict(int=Index(np.arange(10)), date=date_range('20130101', periods=10), period=period_range('2013-01-01', freq='M', periods=10)) - mi = dict(reg2=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', 'foo', 'qux', 'qux'], - ['one', 'two', 'one', 'two', 'one', 'two', 'one', 'two']])), - names=['first', 'second'])) + mi = dict(reg2=MultiIndex.from_tuples( + tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'foo', + 'foo', 'qux', 'qux'], + ['one', 'two', 'one', 'two', 'one', + 'two', 'one', 'two']])), + names=['first', 'second'])) series = dict(float=Series(data['A']), int=Series(data['B']), mixed=Series(data['E']), - ts=Series(np.arange(10).astype(np.int64), index=date_range('20130101',periods=10)), + ts=Series(np.arange(10).astype(np.int64), + index=date_range('20130101', periods=10)), mi=Series(np.arange(5).astype(np.float64), - index=MultiIndex.from_tuples(tuple(zip(*[[1, 1, 2, 2, 2], [3, 4, 3, 4, 5]])), - names=['one', 'two'])), - dup=Series(np.arange(5).astype(np.float64), index=['A', 'B', 'C', 'D', 'A']), + index=MultiIndex.from_tuples( + tuple(zip(*[[1, 1, 2, 2, 2], + [3, 4, 3, 4, 5]])), + names=['one', 'two'])), + dup=Series(np.arange(5).astype(np.float64), + index=['A', 'B', 'C', 'D', 'A']), cat=Series(Categorical(['foo', 'bar', 'baz'])), - dt=Series(date_range('20130101',periods=5)), - dt_tz=Series(date_range('20130101',periods=5,tz='US/Eastern'))) + dt=Series(date_range('20130101', periods=5)), + dt_tz=Series(date_range('20130101', periods=5, + tz='US/Eastern'))) if LooseVersion(pandas.__version__) >= '0.17.0': series['period'] = Series([Period('2000Q1')] * 5) mixed_dup_df = DataFrame(data) 
mixed_dup_df.columns = list("ABCDA") - frame = dict(float=DataFrame(dict(A=series['float'], B=series['float'] + 1)), + frame = dict(float=DataFrame(dict(A=series['float'], + B=series['float'] + 1)), int=DataFrame(dict(A=series['int'], B=series['int'] + 1)), - mixed=DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']])), - mi=DataFrame(dict(A=np.arange(5).astype(np.float64), B=np.arange(5).astype(np.int64)), - index=MultiIndex.from_tuples(tuple(zip(*[['bar', 'bar', 'baz', 'baz', 'baz'], - ['one', 'two', 'one', 'two', 'three']])), - names=['first', 'second'])), + mixed=DataFrame(dict([(k, data[k]) + for k in ['A', 'B', 'C', 'D']])), + mi=DataFrame(dict(A=np.arange(5).astype(np.float64), + B=np.arange(5).astype(np.int64)), + index=MultiIndex.from_tuples( + tuple(zip(*[['bar', 'bar', 'baz', + 'baz', 'baz'], + ['one', 'two', 'one', + 'two', 'three']])), + names=['first', 'second'])), dup=DataFrame(np.arange(15).reshape(5, 3).astype(np.float64), columns=['A', 'B', 'A']), cat_onecol=DataFrame(dict(A=Categorical(['foo', 'bar']))), - cat_and_float=DataFrame(dict(A=Categorical(['foo', 'bar', 'baz']), - B=np.arange(3).astype(np.int64))), + cat_and_float=DataFrame(dict( + A=Categorical(['foo', 'bar', 'baz']), + B=np.arange(3).astype(np.int64))), mixed_dup=mixed_dup_df, - dt_mixed_tzs=DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), B=Timestamp('20130603', tz='CET')), index=range(5)), + dt_mixed_tzs=DataFrame(dict( + A=Timestamp('20130102', tz='US/Eastern'), + B=Timestamp('20130603', tz='CET')), index=range(5)), ) mixed_dup_panel = Panel(dict(ItemA=frame['float'], ItemB=frame['int'])) mixed_dup_panel.items = ['ItemA', 'ItemA'] - panel = dict(float=Panel(dict(ItemA=frame['float'], ItemB=frame['float'] + 1)), + panel = dict(float=Panel(dict(ItemA=frame['float'], + ItemB=frame['float'] + 1)), dup=Panel(np.arange(30).reshape(3, 5, 2).astype(np.float64), items=['A', 'B', 'A']), mixed_dup=mixed_dup_panel) @@ -153,20 +170,22 @@ def create_msgpack_data(): def platform_name(): - return '_'.join([str(pandas.__version__), str(pl.machine()), str(pl.system().lower()), str(pl.python_version())]) + return '_'.join([str(pandas.__version__), str(pl.machine()), + str(pl.system().lower()), str(pl.python_version())]) def write_legacy_pickles(output_dir): # make sure we are < 0.13 compat (in py3) try: - from pandas.compat import zip, cPickle as pickle + from pandas.compat import zip, cPickle as pickle # noqa except: import pickle version = pandas.__version__ - print("This script generates a storage file for the current arch, system, and python version") + print("This script generates a storage file for the current arch, system, " + "and python version") print(" pandas version: {0}".format(version)) print(" output dir : {0}".format(output_dir)) print(" storage format: pickle") @@ -184,7 +203,8 @@ def write_legacy_msgpack(output_dir): version = pandas.__version__ - print("This script generates a storage file for the current arch, system, and python version") + print("This script generates a storage file for the current arch, " + "system, and python version") print(" pandas version: {0}".format(version)) print(" output dir : {0}".format(output_dir)) print(" storage format: msgpack") @@ -200,7 +220,8 @@ def write_legacy_file(): sys.path.insert(0, '.') if len(sys.argv) != 3: - exit("Specify output directory and storage type: generate_legacy_storage_files.py ") + exit("Specify output directory and storage type: generate_legacy_" + "storage_files.py ") output_dir = str(sys.argv[1]) storage_type = str(sys.argv[2]) 
diff --git a/pandas/io/tests/test_clipboard.py b/pandas/io/tests/test_clipboard.py index a056bac293cfa..a7da27a2f75dd 100644 --- a/pandas/io/tests/test_clipboard.py +++ b/pandas/io/tests/test_clipboard.py @@ -13,13 +13,14 @@ try: - import pandas.util.clipboard + import pandas.util.clipboard # noqa except OSError: raise nose.SkipTest("no clipboard found") @disabled class TestClipboard(tm.TestCase): + @classmethod def setUpClass(cls): super(TestClipboard, cls).setUpClass() @@ -40,11 +41,12 @@ def setUpClass(cls): # Test columns exceeding "max_colwidth" (GH8305) _cw = get_option('display.max_colwidth') + 1 cls.data['colwidth'] = mkdf(5, 3, data_gen_f=lambda *args: 'x' * _cw, - c_idx_type='s', r_idx_type='i', - c_idx_names=[None], r_idx_names=[None]) + c_idx_type='s', r_idx_type='i', + c_idx_names=[None], r_idx_names=[None]) # Test GH-5346 max_rows = get_option('display.max_rows') - cls.data['longdf'] = mkdf(max_rows+1, 3, data_gen_f=lambda *args: randint(2), + cls.data['longdf'] = mkdf(max_rows + 1, 3, + data_gen_f=lambda *args: randint(2), c_idx_type='s', r_idx_type='i', c_idx_names=[None], r_idx_names=[None]) # Test for non-ascii text: GH9263 @@ -61,18 +63,18 @@ def check_round_trip_frame(self, data_type, excel=None, sep=None): data = self.data[data_type] data.to_clipboard(excel=excel, sep=sep) if sep is not None: - result = read_clipboard(sep=sep,index_col=0) + result = read_clipboard(sep=sep, index_col=0) else: result = read_clipboard() tm.assert_frame_equal(data, result, check_dtype=False) def test_round_trip_frame_sep(self): for dt in self.data_types: - self.check_round_trip_frame(dt,sep=',') + self.check_round_trip_frame(dt, sep=',') def test_round_trip_frame_string(self): for dt in self.data_types: - self.check_round_trip_frame(dt,excel=False) + self.check_round_trip_frame(dt, excel=False) def test_round_trip_frame(self): for dt in self.data_types: diff --git a/pandas/io/tests/test_common.py b/pandas/io/tests/test_common.py index 73cae1130c740..55fe3f3357c05 100644 --- a/pandas/io/tests/test_common.py +++ b/pandas/io/tests/test_common.py @@ -5,7 +5,6 @@ import os from os.path import isabs -import nose import pandas.util.testing as tm from pandas.io import common @@ -20,6 +19,7 @@ except ImportError: pass + class TestCommonIOCapabilities(tm.TestCase): def test_expand_user(self): diff --git a/pandas/io/tests/test_cparser.py b/pandas/io/tests/test_cparser.py index ceb845073e2c3..52cb56bea1122 100644 --- a/pandas/io/tests/test_cparser.py +++ b/pandas/io/tests/test_cparser.py @@ -3,27 +3,18 @@ """ from pandas.compat import StringIO, BytesIO, map -from datetime import datetime from pandas import compat -import csv import os import sys -import re import nose from numpy import nan import numpy as np -from pandas import DataFrame, Series, Index, isnull, MultiIndex -import pandas.io.parsers as parsers -from pandas.io.parsers import (read_csv, read_table, read_fwf, - TextParser, TextFileReader) -from pandas.util.testing import (assert_almost_equal, assert_frame_equal, - assert_series_equal, network) -import pandas.lib as lib -from pandas import compat -from pandas.lib import Timestamp +from pandas import DataFrame +from pandas.io.parsers import (read_csv, TextFileReader) +from pandas.util.testing import assert_frame_equal import pandas.util.testing as tm @@ -43,19 +34,19 @@ def test_file_handle(self): try: f = open(self.csv1, 'rb') reader = TextReader(f) - result = reader.read() + result = reader.read() # noqa finally: f.close() def test_string_filename(self): reader = TextReader(self.csv1, 
header=None) - result = reader.read() + reader.read() def test_file_handle_mmap(self): try: f = open(self.csv1, 'rb') reader = TextReader(f, memory_map=True, header=None) - result = reader.read() + reader.read() finally: f.close() @@ -63,7 +54,7 @@ def test_StringIO(self): text = open(self.csv1, 'rb').read() src = BytesIO(text) reader = TextReader(src, header=None) - result = reader.read() + reader.read() def test_string_factorize(self): # should this be optional? @@ -136,7 +127,7 @@ def test_integer_thousands_alt(self): data = '123.456\n12.500' reader = TextFileReader(StringIO(data), delimiter=':', - thousands='.', header=None) + thousands='.', header=None) result = reader.read() expected = [123456, 12500] @@ -192,7 +183,7 @@ def test_header_not_enough_lines(self): self.assertEqual(header, expected) recs = reader.read() - expected = {0 : [1, 4], 1 : [2, 5], 2 : [3, 6]} + expected = {0: [1, 4], 1: [2, 5], 2: [3, 6]} assert_array_dicts_equal(expected, recs) # not enough rows @@ -202,7 +193,8 @@ def test_header_not_enough_lines(self): def test_header_not_enough_lines_as_recarray(self): if compat.is_platform_windows(): - raise nose.SkipTest("segfaults on win-64, only when all tests are run") + raise nose.SkipTest( + "segfaults on win-64, only when all tests are run") data = ('skip this\n' 'skip this\n' @@ -279,7 +271,8 @@ def test_numpy_string_dtype_as_recarray(self): aaaaa,5""" if compat.is_platform_windows(): - raise nose.SkipTest("segfaults on win-64, only when all tests are run") + raise nose.SkipTest( + "segfaults on win-64, only when all tests are run") def _make_reader(**kwds): return TextReader(StringIO(data), delimiter=',', header=None, @@ -382,15 +375,15 @@ def test_empty_field_eof(self): index=[1, 1]) c = DataFrame([[1, 2, 3, 4], [6, nan, nan, nan], [8, 9, 10, 11], [13, 14, nan, nan]], - columns=list('abcd'), - index=[0, 5, 7, 12]) + columns=list('abcd'), + index=[0, 5, 7, 12]) for _ in range(100): df = read_csv(StringIO('a,b\nc\n'), skiprows=0, names=['a'], engine='c') assert_frame_equal(df, a) - df = read_csv(StringIO('1,1,1,1,0\n'*2 + '\n'*2), + df = read_csv(StringIO('1,1,1,1,0\n' * 2 + '\n' * 2), names=list("abcd"), engine='c') assert_frame_equal(df, b) @@ -398,6 +391,7 @@ def test_empty_field_eof(self): names=list('abcd'), engine='c') assert_frame_equal(df, c) + def assert_array_dicts_equal(left, right): for k, v in compat.iteritems(left): assert(np.array_equal(v, right[k])) diff --git a/pandas/io/tests/test_data.py b/pandas/io/tests/test_data.py index afc61dc42f569..ee4dd079ccb0a 100644 --- a/pandas/io/tests/test_data.py +++ b/pandas/io/tests/test_data.py @@ -1,3 +1,5 @@ +# flake8: noqa + from __future__ import print_function from pandas import compat import warnings diff --git a/pandas/io/tests/test_date_converters.py b/pandas/io/tests/test_date_converters.py index 2b23556706f0c..3855dc485ed83 100644 --- a/pandas/io/tests/test_date_converters.py +++ b/pandas/io/tests/test_date_converters.py @@ -1,28 +1,17 @@ -from pandas.compat import StringIO, BytesIO +from pandas.compat import StringIO from datetime import date, datetime -import csv -import os -import sys -import re import nose -from numpy import nan import numpy as np -from numpy.testing.decorators import slow - -from pandas import DataFrame, Series, Index, MultiIndex, isnull -import pandas.io.parsers as parsers -from pandas.io.parsers import (read_csv, read_table, read_fwf, - TextParser) -from pandas.util.testing import (assert_almost_equal, assert_frame_equal, - assert_series_equal, network) -import pandas.lib as lib 
-from pandas import compat -from pandas.lib import Timestamp + +from pandas import DataFrame, MultiIndex +from pandas.io.parsers import (read_csv, read_table) +from pandas.util.testing import assert_frame_equal import pandas.io.date_converters as conv import pandas.util.testing as tm + class TestConverters(tm.TestCase): def setUp(self): @@ -68,7 +57,8 @@ def test_parse_date_fields(self): expected = np.array([datetime(2007, 1, 3), datetime(2008, 2, 4)]) self.assertTrue((result == expected).all()) - data = "year, month, day, a\n 2001 , 01 , 10 , 10.\n 2001 , 02 , 1 , 11." + data = ("year, month, day, a\n 2001 , 01 , 10 , 10.\n" + "2001 , 02 , 1 , 11.") datecols = {'ymd': [0, 1, 2]} df = read_table(StringIO(data), sep=',', header=0, parse_dates=datecols, @@ -136,8 +126,9 @@ def date_parser(date, time): parse_dates={'datetime': ['date', 'time']}, index_col=['datetime', 'prn']) - datetimes = np.array(['2013-11-03T19:00:00Z']*3, dtype='datetime64[s]') - df_correct = DataFrame(data={'rxstatus': ['00E80000']*3}, + datetimes = np.array(['2013-11-03T19:00:00Z'] * 3, + dtype='datetime64[s]') + df_correct = DataFrame(data={'rxstatus': ['00E80000'] * 3}, index=MultiIndex.from_tuples( [(datetimes[0], 126), (datetimes[1], 23), @@ -146,6 +137,5 @@ def date_parser(date, time): assert_frame_equal(df, df_correct) if __name__ == '__main__': - import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/io/tests/test_excel.py b/pandas/io/tests/test_excel.py index 8023c25cdd660..082a26df681a4 100644 --- a/pandas/io/tests/test_excel.py +++ b/pandas/io/tests/test_excel.py @@ -82,6 +82,7 @@ def _skip_if_no_boto(): class SharedItems(object): + def setUp(self): self.dirpath = tm.get_data_path() self.frame = _frame.copy() @@ -233,13 +234,13 @@ def test_excel_passes_na(self): excel = self.get_excelfile('test4') parsed = read_excel(excel, 'Sheet1', keep_default_na=False, - na_values=['apple']) + na_values=['apple']) expected = DataFrame([['NA'], [1], ['NA'], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) parsed = read_excel(excel, 'Sheet1', keep_default_na=True, - na_values=['apple']) + na_values=['apple']) expected = DataFrame([[np.nan], [1], [np.nan], [np.nan], ['rabbit']], columns=['Test']) tm.assert_frame_equal(parsed, expected) @@ -325,7 +326,8 @@ def test_reader_special_dtypes(self): # convert_float and converters should be different but both accepted expected["StrCol"] = expected["StrCol"].apply(str) - actual = self.get_exceldf(basename, 'Sheet1', converters={"StrCol": str}) + actual = self.get_exceldf( + basename, 'Sheet1', converters={"StrCol": str}) tm.assert_frame_equal(actual, expected) no_convert_float = float_expected.copy() @@ -352,7 +354,8 @@ def test_reader_converters(self): 3: lambda x: str(x) if x else '', } - # should read in correctly and set types of single cells (not array dtypes) + # should read in correctly and set types of single cells (not array + # dtypes) actual = self.get_exceldf(basename, 'Sheet1', converters=converters) tm.assert_frame_equal(actual, expected) @@ -490,21 +493,21 @@ def test_creating_and_reading_multiple_sheets(self): _skip_if_no_openpyxl() def tdf(sheetname): - d, i = [11,22,33], [1,2,3] - return DataFrame(d,i,columns=[sheetname]) + d, i = [11, 22, 33], [1, 2, 3] + return DataFrame(d, i, columns=[sheetname]) - sheets = ['AAA','BBB','CCC'] + sheets = ['AAA', 'BBB', 'CCC'] dfs = [tdf(s) for s in sheets] - dfs = dict(zip(sheets,dfs)) + dfs = dict(zip(sheets, dfs)) with 
ensure_clean(self.ext) as pth: with ExcelWriter(pth) as ew: for sheetname, df in iteritems(dfs): - df.to_excel(ew,sheetname) - dfs_returned = read_excel(pth,sheetname=sheets) + df.to_excel(ew, sheetname) + dfs_returned = read_excel(pth, sheetname=sheets) for s in sheets: - tm.assert_frame_equal(dfs[s],dfs_returned[s]) + tm.assert_frame_equal(dfs[s], dfs_returned[s]) def test_reader_seconds(self): # Test reading times with and without milliseconds. GH5945. @@ -546,133 +549,152 @@ def test_reader_seconds(self): tm.assert_frame_equal(actual, expected) def test_read_excel_multiindex(self): - #GH 4679 - mi = MultiIndex.from_product([['foo','bar'],['a','b']]) + # GH 4679 + mi = MultiIndex.from_product([['foo', 'bar'], ['a', 'b']]) mi_file = os.path.join(self.dirpath, 'testmultiindex' + self.ext) expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], - [2, 3.5, pd.Timestamp('2015-01-02'), False], - [3, 4.5, pd.Timestamp('2015-01-03'), False], - [4, 5.5, pd.Timestamp('2015-01-04'), True]], - columns = mi) + [2, 3.5, pd.Timestamp('2015-01-02'), False], + [3, 4.5, pd.Timestamp('2015-01-03'), False], + [4, 5.5, pd.Timestamp('2015-01-04'), True]], + columns=mi) - actual = read_excel(mi_file, 'mi_column', header=[0,1]) + actual = read_excel(mi_file, 'mi_column', header=[0, 1]) tm.assert_frame_equal(actual, expected) - actual = read_excel(mi_file, 'mi_column', header=[0,1], index_col=0) + actual = read_excel(mi_file, 'mi_column', header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) expected.columns = ['a', 'b', 'c', 'd'] expected.index = mi - actual = read_excel(mi_file, 'mi_index', index_col=[0,1]) + actual = read_excel(mi_file, 'mi_index', index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) expected.columns = mi - actual = read_excel(mi_file, 'both', index_col=[0,1], header=[0,1]) + actual = read_excel(mi_file, 'both', index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) expected.index = mi.set_names(['ilvl1', 'ilvl2']) expected.columns = ['a', 'b', 'c', 'd'] - actual = read_excel(mi_file, 'mi_index_name', index_col=[0,1]) + actual = read_excel(mi_file, 'mi_index_name', index_col=[0, 1]) tm.assert_frame_equal(actual, expected) expected.index = list(range(4)) expected.columns = mi.set_names(['c1', 'c2']) - actual = read_excel(mi_file, 'mi_column_name', header=[0,1], index_col=0) + actual = read_excel(mi_file, 'mi_column_name', + header=[0, 1], index_col=0) tm.assert_frame_equal(actual, expected) # Issue #11317 - expected.columns = mi.set_levels([1,2],level=1).set_names(['c1', 'c2']) - actual = read_excel(mi_file, 'name_with_int', index_col=0, header=[0,1]) + expected.columns = mi.set_levels( + [1, 2], level=1).set_names(['c1', 'c2']) + actual = read_excel(mi_file, 'name_with_int', + index_col=0, header=[0, 1]) tm.assert_frame_equal(actual, expected) expected.columns = mi.set_names(['c1', 'c2']) expected.index = mi.set_names(['ilvl1', 'ilvl2']) - actual = read_excel(mi_file, 'both_name', index_col=[0,1], header=[0,1]) + actual = read_excel(mi_file, 'both_name', + index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected) - actual = read_excel(mi_file, 'both_name', index_col=[0,1], header=[0,1]) + actual = read_excel(mi_file, 'both_name', + index_col=[0, 1], header=[0, 1]) tm.assert_frame_equal(actual, expected) - actual = read_excel(mi_file, 'both_name_skiprows', index_col=[0,1], - header=[0,1], skiprows=2) + actual = read_excel(mi_file, 'both_name_skiprows', index_col=[0, 1], + header=[0, 1], 
skiprows=2) tm.assert_frame_equal(actual, expected) - def test_excel_multindex_roundtrip(self): - #GH 4679 + # GH 4679 _skip_if_no_xlsxwriter() with ensure_clean('.xlsx') as pth: for c_idx_names in [True, False]: for r_idx_names in [True, False]: for c_idx_levels in [1, 3]: for r_idx_levels in [1, 3]: - # column index name can't be serialized unless MultiIndex + # column index name can't be serialized unless + # MultiIndex if (c_idx_levels == 1 and c_idx_names): continue - # empty name case current read in as unamed levels, not Nones + # empty name case current read in as unamed levels, + # not Nones check_names = True if not r_idx_names and r_idx_levels > 1: check_names = False df = mkdf(5, 5, c_idx_names, - r_idx_names, c_idx_levels, - r_idx_levels) + r_idx_names, c_idx_levels, + r_idx_levels) df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) + act = pd.read_excel( + pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal( + df, act, check_names=check_names) df.iloc[0, :] = np.nan df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) + act = pd.read_excel( + pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal( + df, act, check_names=check_names) df.iloc[-1, :] = np.nan df.to_excel(pth) - act = pd.read_excel(pth, index_col=list(range(r_idx_levels)), - header=list(range(c_idx_levels))) - tm.assert_frame_equal(df, act, check_names=check_names) + act = pd.read_excel( + pth, index_col=list(range(r_idx_levels)), + header=list(range(c_idx_levels))) + tm.assert_frame_equal( + df, act, check_names=check_names) def test_excel_oldindex_format(self): - #GH 4679 + # GH 4679 data = np.array([['R0C0', 'R0C1', 'R0C2', 'R0C3', 'R0C4'], ['R1C0', 'R1C1', 'R1C2', 'R1C3', 'R1C4'], ['R2C0', 'R2C1', 'R2C2', 'R2C3', 'R2C4'], ['R3C0', 'R3C1', 'R3C2', 'R3C3', 'R3C4'], ['R4C0', 'R4C1', 'R4C2', 'R4C3', 'R4C4']]) columns = ['C_l0_g0', 'C_l0_g1', 'C_l0_g2', 'C_l0_g3', 'C_l0_g4'] - mi = MultiIndex(levels=[['R_l0_g0', 'R_l0_g1', 'R_l0_g2', 'R_l0_g3', 'R_l0_g4'], - ['R_l1_g0', 'R_l1_g1', 'R_l1_g2', 'R_l1_g3', 'R_l1_g4']], + mi = MultiIndex(levels=[['R_l0_g0', 'R_l0_g1', 'R_l0_g2', + 'R_l0_g3', 'R_l0_g4'], + ['R_l1_g0', 'R_l1_g1', 'R_l1_g2', + 'R_l1_g3', 'R_l1_g4']], labels=[[0, 1, 2, 3, 4], [0, 1, 2, 3, 4]], names=['R0', 'R1']) - si = Index(['R_l0_g0', 'R_l0_g1', 'R_l0_g2', 'R_l0_g3', 'R_l0_g4'], name='R0') + si = Index(['R_l0_g0', 'R_l0_g1', 'R_l0_g2', + 'R_l0_g3', 'R_l0_g4'], name='R0') - in_file = os.path.join(self.dirpath, 'test_index_name_pre17' + self.ext) + in_file = os.path.join( + self.dirpath, 'test_index_name_pre17' + self.ext) expected = pd.DataFrame(data, index=si, columns=columns) with tm.assert_produces_warning(FutureWarning): - actual = pd.read_excel(in_file, 'single_names', has_index_names=True) + actual = pd.read_excel( + in_file, 'single_names', has_index_names=True) tm.assert_frame_equal(actual, expected) expected.index.name = None actual = pd.read_excel(in_file, 'single_no_names') tm.assert_frame_equal(actual, expected) with tm.assert_produces_warning(FutureWarning): - actual = pd.read_excel(in_file, 'single_no_names', has_index_names=False) + actual = pd.read_excel( + in_file, 'single_no_names', has_index_names=False) tm.assert_frame_equal(actual, expected) 
expected.index = mi with tm.assert_produces_warning(FutureWarning): - actual = pd.read_excel(in_file, 'multi_names', has_index_names=True) + actual = pd.read_excel( + in_file, 'multi_names', has_index_names=True) tm.assert_frame_equal(actual, expected) expected.index.names = [None, None] - actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0,1]) + actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0, 1]) tm.assert_frame_equal(actual, expected, check_names=False) with tm.assert_produces_warning(FutureWarning): - actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0,1], + actual = pd.read_excel(in_file, 'multi_no_names', index_col=[0, 1], has_index_names=False) tm.assert_frame_equal(actual, expected, check_names=False) @@ -684,7 +706,7 @@ def test_read_excel_bool_header_arg(self): header=arg) def test_read_excel_chunksize(self): - #GH 8011 + # GH 8011 with tm.assertRaises(NotImplementedError): pd.read_excel(os.path.join(self.dirpath, 'test1' + self.ext), chunksize=100) @@ -703,20 +725,23 @@ def test_read_excel_date_parser(self): date_parser=dateparse) def test_read_excel_skiprows_list(self): - #GH 4903 - actual = pd.read_excel(os.path.join(self.dirpath, 'testskiprows' + self.ext), - 'skiprows_list', skiprows=[0,2]) + # GH 4903 + actual = pd.read_excel(os.path.join(self.dirpath, + 'testskiprows' + self.ext), + 'skiprows_list', skiprows=[0, 2]) expected = DataFrame([[1, 2.5, pd.Timestamp('2015-01-01'), True], [2, 3.5, pd.Timestamp('2015-01-02'), False], [3, 4.5, pd.Timestamp('2015-01-03'), False], [4, 5.5, pd.Timestamp('2015-01-04'), True]], - columns = ['a','b','c','d']) + columns=['a', 'b', 'c', 'd']) tm.assert_frame_equal(actual, expected) - actual = pd.read_excel(os.path.join(self.dirpath, 'testskiprows' + self.ext), - 'skiprows_list', skiprows=np.array([0,2])) + actual = pd.read_excel(os.path.join(self.dirpath, + 'testskiprows' + self.ext), + 'skiprows_list', skiprows=np.array([0, 2])) tm.assert_frame_equal(actual, expected) + class XlsReaderTests(XlrdTests, tm.TestCase): ext = '.xls' engine_name = 'xlrd' @@ -735,8 +760,6 @@ class XlsmReaderTests(XlrdTests, tm.TestCase): check_skip = staticmethod(_skip_if_no_xlrd) - - class ExcelWriterBase(SharedItems): # Base class for test cases to run with different Excel writers. 
# To add a writer test, define the following: @@ -882,7 +905,8 @@ def test_int_types(self): float_frame = frame.astype(float) recons = read_excel(path, 'test1', convert_float=False) tm.assert_frame_equal(recons, float_frame, - check_index_type=False, check_column_type=False) + check_index_type=False, + check_column_type=False) def test_float_types(self): _skip_if_no_xlrd() @@ -982,8 +1006,8 @@ def test_roundtrip_indexlabels(self): merge_cells=self.merge_cells) reader = ExcelFile(path) recons = read_excel(reader, 'test1', - index_col=0, - ).astype(np.int64) + index_col=0, + ).astype(np.int64) frame.index.names = ['test'] self.assertEqual(frame.index.names, recons.index.names) @@ -994,8 +1018,8 @@ def test_roundtrip_indexlabels(self): merge_cells=self.merge_cells) reader = ExcelFile(path) recons = read_excel(reader, 'test1', - index_col=0, - ).astype(np.int64) + index_col=0, + ).astype(np.int64) frame.index.names = ['test'] self.assertEqual(frame.index.names, recons.index.names) @@ -1006,8 +1030,8 @@ def test_roundtrip_indexlabels(self): merge_cells=self.merge_cells) reader = ExcelFile(path) recons = read_excel(reader, 'test1', - index_col=0, - ).astype(np.int64) + index_col=0, + ).astype(np.int64) frame.index.names = ['test'] tm.assert_frame_equal(frame, recons.astype(bool)) @@ -1036,7 +1060,7 @@ def test_excel_roundtrip_indexname(self): xf = ExcelFile(path) result = read_excel(xf, xf.sheet_names[0], - index_col=0) + index_col=0) tm.assert_frame_equal(result, df) self.assertEqual(result.index.name, 'foo') @@ -1072,8 +1096,8 @@ def test_excel_date_datetime_format(self): with ensure_clean(self.ext) as filename2: writer1 = ExcelWriter(filename1) writer2 = ExcelWriter(filename2, - date_format='DD.MM.YYYY', - datetime_format='DD.MM.YYYY HH-MM-SS') + date_format='DD.MM.YYYY', + datetime_format='DD.MM.YYYY HH-MM-SS') df.to_excel(writer1, 'test1') df.to_excel(writer2, 'test1') @@ -1123,7 +1147,7 @@ def test_to_excel_multiindex(self): frame.to_excel(path, 'test1', merge_cells=self.merge_cells) reader = ExcelFile(path) df = read_excel(reader, 'test1', index_col=[0, 1], - parse_dates=False) + parse_dates=False) tm.assert_frame_equal(frame, df) # Test for Issue 11328. 
If column indices are integers, make @@ -1146,7 +1170,7 @@ def test_to_excel_multiindex_cols(self): header = 0 with ensure_clean(self.ext) as path: - # round trip + # round trip frame.to_excel(path, 'test1', merge_cells=self.merge_cells) reader = ExcelFile(path) df = read_excel(reader, 'test1', header=header, @@ -1155,7 +1179,7 @@ def test_to_excel_multiindex_cols(self): if not self.merge_cells: fm = frame.columns.format(sparsify=False, adjoin=False, names=False) - frame.columns = [ ".".join(map(str, q)) for q in zip(*fm) ] + frame.columns = [".".join(map(str, q)) for q in zip(*fm)] tm.assert_frame_equal(frame, df) def test_to_excel_multiindex_dates(self): @@ -1171,7 +1195,7 @@ def test_to_excel_multiindex_dates(self): tsframe.to_excel(path, 'test1', merge_cells=self.merge_cells) reader = ExcelFile(path) recons = read_excel(reader, 'test1', - index_col=[0, 1]) + index_col=[0, 1]) tm.assert_frame_equal(tsframe, recons) self.assertEqual(recons.index.names, ('time', 'foo')) @@ -1206,7 +1230,7 @@ def test_to_excel_float_format(self): df = DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + index=['A', 'B'], columns=['X', 'Y', 'Z']) with ensure_clean(self.ext) as filename: df.to_excel(filename, 'test1', float_format='%.2f') @@ -1215,7 +1239,7 @@ def test_to_excel_float_format(self): rs = read_excel(reader, 'test1', index_col=None) xp = DataFrame([[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + index=['A', 'B'], columns=['X', 'Y', 'Z']) tm.assert_frame_equal(rs, xp) def test_to_excel_output_encoding(self): @@ -1226,7 +1250,8 @@ def test_to_excel_output_encoding(self): # avoid mixed inferred_type df = DataFrame([[u'\u0192', u'\u0193', u'\u0194'], [u'\u0195', u'\u0196', u'\u0197']], - index=[u'A\u0192', u'B'], columns=[u'X\u0193', u'Y', u'Z']) + index=[u'A\u0192', u'B'], + columns=[u'X\u0193', u'Y', u'Z']) with ensure_clean(filename) as filename: df.to_excel(filename, sheet_name='TestSheet', encoding='utf8') @@ -1245,7 +1270,7 @@ def test_to_excel_unicode_filename(self): df = DataFrame([[0.123456, 0.234567, 0.567567], [12.32112, 123123.2, 321321.2]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + index=['A', 'B'], columns=['X', 'Y', 'Z']) df.to_excel(filename, 'test1', float_format='%.2f') @@ -1253,7 +1278,7 @@ def test_to_excel_unicode_filename(self): rs = read_excel(reader, 'test1', index_col=None) xp = DataFrame([[0.12, 0.23, 0.57], [12.32, 123123.20, 321321.20]], - index=['A', 'B'], columns=['X', 'Y', 'Z']) + index=['A', 'B'], columns=['X', 'Y', 'Z']) tm.assert_frame_equal(rs, xp) # def test_to_excel_header_styling_xls(self): @@ -1370,7 +1395,8 @@ def test_excel_010_hemstring(self): def roundtrip(df, header=True, parser_hdr=0, index=True): with ensure_clean(self.ext) as path: - df.to_excel(path, header=header, merge_cells=self.merge_cells, index=index) + df.to_excel(path, header=header, + merge_cells=self.merge_cells, index=index) xf = ExcelFile(path) res = read_excel(xf, xf.sheet_names[0], header=parser_hdr) return res @@ -1382,9 +1408,9 @@ def roundtrip(df, header=True, parser_hdr=0, index=True): for j in range(1, 4): # col "" df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) - #this if will be removed once multi column excel writing - #is implemented for now fixing #9794 - if j>1: + # this if will be removed once multi column excel writing + # is implemented for now fixing #9794 + if j > 1: with tm.assertRaises(NotImplementedError): res = roundtrip(df, 
use_headers, index=False) else: @@ -1424,17 +1450,19 @@ def test_excel_010_hemstring_raises_NotImplementedError(self): def roundtrip2(df, header=True, parser_hdr=0, index=True): with ensure_clean(self.ext) as path: - df.to_excel(path, header=header, merge_cells=self.merge_cells, index=index) + df.to_excel(path, header=header, + merge_cells=self.merge_cells, index=index) xf = ExcelFile(path) res = read_excel(xf, xf.sheet_names[0], header=parser_hdr) return res - nrows = 5; ncols = 3 - j = 2; i = 1 + nrows = 5 + ncols = 3 + j = 2 + i = 1 df = mkdf(nrows, ncols, r_idx_nlevels=i, c_idx_nlevels=j) with tm.assertRaises(NotImplementedError): - res = roundtrip2(df, header=False, index=False) - + roundtrip2(df, header=False, index=False) def test_duplicated_columns(self): # Test for issue #5235 @@ -1452,11 +1480,11 @@ def test_duplicated_columns(self): tm.assert_frame_equal(write_frame, read_frame) # 11007 / #10970 - write_frame = DataFrame([[1,2,3,4],[5,6,7,8]], - columns=['A','B','A','B']) + write_frame = DataFrame([[1, 2, 3, 4], [5, 6, 7, 8]], + columns=['A', 'B', 'A', 'B']) write_frame.to_excel(path, 'test1') read_frame = read_excel(path, 'test1') - read_frame.columns = ['A','B','A','B'] + read_frame.columns = ['A', 'B', 'A', 'B'] tm.assert_frame_equal(write_frame, read_frame) # 10982 @@ -1488,14 +1516,13 @@ def test_invalid_columns(self): 'B': [2, 2, 2]}) write_frame.to_excel(path, 'test1', columns=['B', 'C']) - expected = write_frame.loc[:, ['B','C']] + expected = write_frame.loc[:, ['B', 'C']] read_frame = read_excel(path, 'test1') tm.assert_frame_equal(expected, read_frame) with tm.assertRaises(KeyError): write_frame.to_excel(path, 'test1', columns=['C', 'D']) - def test_datetimes(self): # Test writing and reading datetimes. For issue #9139. (xref #9185) @@ -1557,7 +1584,8 @@ def wrapped(self, *args, **kwargs): if openpyxl_compat.is_compat(major_ver=major_ver): orig_method(self, *args, **kwargs) else: - msg = 'Installed openpyxl is not supported at this time\. Use.+' + msg = ('Installed openpyxl is not supported at this ' + 'time\. 
Use.+') with tm.assertRaisesRegexp(ValueError, msg): orig_method(self, *args, **kwargs) return wrapped @@ -1566,9 +1594,11 @@ def wrapped(self, *args, **kwargs): def raise_on_incompat_version(major_ver): def versioned_raise_on_incompat_version(cls): - methods = filter(operator.methodcaller('startswith', 'test_'), dir(cls)) + methods = filter(operator.methodcaller( + 'startswith', 'test_'), dir(cls)) for method in methods: - setattr(cls, method, raise_wrapper(major_ver)(getattr(cls, method))) + setattr(cls, method, raise_wrapper( + major_ver)(getattr(cls, method))) return cls return versioned_raise_on_incompat_version @@ -1617,12 +1647,14 @@ def setUpClass(cls): _skip_if_no_openpyxl() import openpyxl ver = openpyxl.__version__ - if not (LooseVersion(ver) >= LooseVersion('2.0.0') and LooseVersion(ver) < LooseVersion('2.2.0')): + if (not (LooseVersion(ver) >= LooseVersion('2.0.0') and + LooseVersion(ver) < LooseVersion('2.2.0'))): raise nose.SkipTest("openpyxl %s >= 2.2" % str(ver)) cls.setUpClass = setUpClass return cls + @raise_on_incompat_version(2) @skip_openpyxl_gt21 class Openpyxl20Tests(ExcelWriterBase, tm.TestCase): @@ -1678,7 +1710,7 @@ def test_to_excel_styleconverter(self): if ver >= LooseVersion('2.0.0') and ver < LooseVersion('2.1.0'): number_format = styles.NumberFormat(format_code='0.00') else: - number_format = '0.00' # XXX: Only works with openpyxl-2.1.0 + number_format = '0.00' # XXX: Only works with openpyxl-2.1.0 protection = styles.Protection(locked=True, hidden=False) @@ -1690,12 +1722,11 @@ def test_to_excel_styleconverter(self): self.assertEqual(kw['number_format'], number_format) self.assertEqual(kw['protection'], protection) - def test_write_cells_merge_styled(self): from pandas.core.format import ExcelCell from openpyxl import styles - sheet_name='merge_styled' + sheet_name = 'merge_styled' sty_b1 = {'font': {'color': '00FF0000'}} sty_a2 = {'font': {'color': '0000FF00'}} @@ -1705,12 +1736,12 @@ def test_write_cells_merge_styled(self): ExcelCell(col=0, row=1, val=99, style=sty_a2), ] - sty_merged = {'font': { 'color': '000000FF', 'bold': True }} + sty_merged = {'font': {'color': '000000FF', 'bold': True}} sty_kwargs = _Openpyxl20Writer._convert_to_style_kwargs(sty_merged) openpyxl_sty_merged = styles.Style(**sty_kwargs) merge_cells = [ ExcelCell(col=0, row=0, val='pandas', - mergestart=1, mergeend=1, style=sty_merged), + mergestart=1, mergeend=1, style=sty_merged), ] with ensure_clean('.xlsx') as path: @@ -1724,6 +1755,7 @@ def test_write_cells_merge_styled(self): self.assertEqual(xcell_b1.style, openpyxl_sty_merged) self.assertEqual(xcell_a2.style, openpyxl_sty_merged) + def skip_openpyxl_lt22(cls): """Skip a TestCase instance if openpyxl < 2.2""" @@ -1738,6 +1770,7 @@ def setUpClass(cls): cls.setUpClass = setUpClass return cls + @raise_on_incompat_version(2) @skip_openpyxl_lt22 class Openpyxl22Tests(ExcelWriterBase, tm.TestCase): @@ -1746,7 +1779,6 @@ class Openpyxl22Tests(ExcelWriterBase, tm.TestCase): check_skip = staticmethod(lambda *args, **kwargs: None) def test_to_excel_styleconverter(self): - import openpyxl from openpyxl import styles hstyle = { @@ -1800,15 +1832,13 @@ def test_to_excel_styleconverter(self): self.assertEqual(kw['number_format'], number_format) self.assertEqual(kw['protection'], protection) - def test_write_cells_merge_styled(self): if not openpyxl_compat.is_compat(major_ver=2): raise nose.SkipTest('incompatiable openpyxl version') from pandas.core.format import ExcelCell - from openpyxl import styles - sheet_name='merge_styled' + sheet_name 
= 'merge_styled' sty_b1 = {'font': {'color': '00FF0000'}} sty_a2 = {'font': {'color': '0000FF00'}} @@ -1818,12 +1848,12 @@ def test_write_cells_merge_styled(self): ExcelCell(col=0, row=1, val=99, style=sty_a2), ] - sty_merged = {'font': { 'color': '000000FF', 'bold': True }} + sty_merged = {'font': {'color': '000000FF', 'bold': True}} sty_kwargs = _Openpyxl22Writer._convert_to_style_kwargs(sty_merged) openpyxl_sty_merged = sty_kwargs['font'] merge_cells = [ ExcelCell(col=0, row=0, val='pandas', - mergestart=1, mergeend=1, style=sty_merged), + mergestart=1, mergeend=1, style=sty_merged), ] with ensure_clean('.xlsx') as path: @@ -1847,8 +1877,8 @@ def test_excel_raise_error_on_multiindex_columns_and_no_index(self): _skip_if_no_xlwt() # MultiIndex as columns is not yet implemented 9794 cols = MultiIndex.from_tuples([('site', ''), - ('2014', 'height'), - ('2014', 'weight')]) + ('2014', 'height'), + ('2014', 'weight')]) df = DataFrame(np.random.randn(10, 3), columns=cols) with tm.assertRaises(NotImplementedError): with ensure_clean(self.ext) as path: @@ -1857,8 +1887,8 @@ def test_excel_raise_error_on_multiindex_columns_and_no_index(self): def test_excel_multiindex_columns_and_index_true(self): _skip_if_no_xlwt() cols = MultiIndex.from_tuples([('site', ''), - ('2014', 'height'), - ('2014', 'weight')]) + ('2014', 'height'), + ('2014', 'weight')]) df = pd.DataFrame(np.random.randn(10, 3), columns=cols) with ensure_clean(self.ext) as path: df.to_excel(path, index=True) @@ -1867,8 +1897,8 @@ def test_excel_multiindex_index(self): _skip_if_no_xlwt() # MultiIndex as index works so assert no error #9794 cols = MultiIndex.from_tuples([('site', ''), - ('2014', 'height'), - ('2014', 'weight')]) + ('2014', 'height'), + ('2014', 'weight')]) df = DataFrame(np.random.randn(3, 10), index=cols) with ensure_clean(self.ext) as path: df.to_excel(path, index=False) @@ -1975,7 +2005,7 @@ def test_ExcelWriter_dispatch(self): ExcelWriter('nothing') try: - import xlsxwriter + import xlsxwriter # noqa writer_klass = _XlsxWriter except ImportError: _skip_if_no_openpyxl() diff --git a/pandas/io/tests/test_ga.py b/pandas/io/tests/test_ga.py index 965b3441d7405..b8b698691a9f5 100644 --- a/pandas/io/tests/test_ga.py +++ b/pandas/io/tests/test_ga.py @@ -1,3 +1,5 @@ +# flake8: noqa + import os from datetime import datetime diff --git a/pandas/io/tests/test_gbq.py b/pandas/io/tests/test_gbq.py index cc1e901d8f119..88a1e3e0a5cc3 100644 --- a/pandas/io/tests/test_gbq.py +++ b/pandas/io/tests/test_gbq.py @@ -31,7 +31,7 @@ def _test_imports(): global _GOOGLE_API_CLIENT_INSTALLED, _GOOGLE_API_CLIENT_VALID_VERSION, \ - _HTTPLIB2_INSTALLED, _SETUPTOOLS_INSTALLED + _HTTPLIB2_INSTALLED, _SETUPTOOLS_INSTALLED try: import pkg_resources @@ -46,25 +46,27 @@ def _test_imports(): if _SETUPTOOLS_INSTALLED: try: - from apiclient.discovery import build - from apiclient.errors import HttpError + from apiclient.discovery import build # noqa + from apiclient.errors import HttpError # noqa - from oauth2client.client import OAuth2WebServerFlow - from oauth2client.client import AccessTokenRefreshError + from oauth2client.client import OAuth2WebServerFlow # noqa + from oauth2client.client import AccessTokenRefreshError # noqa - from oauth2client.file import Storage - from oauth2client.tools import run_flow - _GOOGLE_API_CLIENT_INSTALLED=True - _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution('google-api-python-client').version + from oauth2client.file import Storage # noqa + from oauth2client.tools import run_flow # noqa + 
_GOOGLE_API_CLIENT_INSTALLED = True + _GOOGLE_API_CLIENT_VERSION = pkg_resources.get_distribution( + 'google-api-python-client').version - if StrictVersion(_GOOGLE_API_CLIENT_VERSION) >= StrictVersion(google_api_minimum_version): + if (StrictVersion(_GOOGLE_API_CLIENT_VERSION) >= + StrictVersion(google_api_minimum_version)): _GOOGLE_API_CLIENT_VALID_VERSION = True except ImportError: _GOOGLE_API_CLIENT_INSTALLED = False try: - import httplib2 + import httplib2 # noqa _HTTPLIB2_INSTALLED = True except ImportError: _HTTPLIB2_INSTALLED = False @@ -76,11 +78,15 @@ def _test_imports(): raise ImportError('Could not import Google API Client.') if not _GOOGLE_API_CLIENT_VALID_VERSION: - raise ImportError("pandas requires google-api-python-client >= {0} for Google BigQuery support, " - "current version {1}".format(google_api_minimum_version, _GOOGLE_API_CLIENT_VERSION)) + raise ImportError("pandas requires google-api-python-client >= {0} " + "for Google BigQuery support, " + "current version {1}" + .format(google_api_minimum_version, + _GOOGLE_API_CLIENT_VERSION)) if not _HTTPLIB2_INSTALLED: - raise ImportError("pandas requires httplib2 for Google BigQuery support") + raise ImportError( + "pandas requires httplib2 for Google BigQuery support") def test_requirements(): @@ -110,13 +116,14 @@ def make_mixed_dataframe_v2(test_size): flts = np.random.randn(1, test_size) ints = np.random.randint(1, 10, size=(1, test_size)) strs = np.random.randint(1, 10, size=(1, test_size)).astype(str) - times = [datetime.now(pytz.timezone('US/Arizona')) for t in range(test_size)] + times = [datetime.now(pytz.timezone('US/Arizona')) + for t in range(test_size)] return DataFrame({'bools': bools[0], 'flts': flts[0], 'ints': ints[0], 'strs': strs[0], 'times': times[0]}, - index=range(test_size)) + index=range(test_size)) def test_generate_bq_schema_deprecated(): @@ -125,17 +132,21 @@ def test_generate_bq_schema_deprecated(): df = make_mixed_dataframe_v2(10) gbq.generate_bq_schema(df) + class TestGBQConnectorIntegration(tm.TestCase): + def setUp(self): test_requirements() if not PROJECT_ID: - raise nose.SkipTest("Cannot run integration tests without a project id") + raise nose.SkipTest( + "Cannot run integration tests without a project id") self.sut = gbq.GbqConnector(PROJECT_ID) def test_should_be_able_to_make_a_connector(self): - self.assertTrue(self.sut is not None, 'Could not create a GbqConnector') + self.assertTrue(self.sut is not None, + 'Could not create a GbqConnector') def test_should_be_able_to_get_valid_credentials(self): credentials = self.sut.get_credentials() @@ -156,6 +167,7 @@ def test_should_be_able_to_get_results_from_query(self): class TestReadGBQUnitTests(tm.TestCase): + def setUp(self): test_requirements() @@ -192,7 +204,8 @@ def test_read_gbq_with_no_project_id_given_should_fail(self): gbq.read_gbq('SELECT "1" as NUMBER_1') def test_that_parse_data_works_properly(self): - test_schema = {'fields': [{'mode': 'NULLABLE', 'name': 'VALID_STRING', 'type': 'STRING'}]} + test_schema = {'fields': [ + {'mode': 'NULLABLE', 'name': 'VALID_STRING', 'type': 'STRING'}]} test_page = [{'f': [{'v': 'PI'}]}] test_output = gbq._parse_data(test_schema, test_page) @@ -201,31 +214,36 @@ def test_that_parse_data_works_properly(self): class TestReadGBQIntegration(tm.TestCase): + @classmethod def setUpClass(cls): # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *BEFORE* executing *ALL* tests - # described below. 
+ # put here any instruction you want to execute only *ONCE* *BEFORE* + # executing *ALL* tests described below. if not PROJECT_ID: - raise nose.SkipTest("Cannot run integration tests without a project id") + raise nose.SkipTest( + "Cannot run integration tests without a project id") test_requirements() def setUp(self): # - PER-TEST FIXTURES - - # put here any instruction you want to be run *BEFORE* *EVERY* test is executed. + # put here any instruction you want to be run *BEFORE* *EVERY* test is + # executed. pass @classmethod def tearDownClass(cls): # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *AFTER* executing all tests. + # put here any instruction you want to execute only *ONCE* *AFTER* + # executing all tests. pass def tearDown(self): # - PER-TEST FIXTURES - - # put here any instructions you want to be run *AFTER* *EVERY* test is executed. + # put here any instructions you want to be run *AFTER* *EVERY* test is + # executed. pass def test_should_properly_handle_valid_strings(self): @@ -256,7 +274,8 @@ def test_should_properly_handle_null_integers(self): def test_should_properly_handle_valid_floats(self): query = 'SELECT PI() as VALID_FLOAT' df = gbq.read_gbq(query, project_id=PROJECT_ID) - tm.assert_frame_equal(df, DataFrame({'VALID_FLOAT': [3.141592653589793]})) + tm.assert_frame_equal(df, DataFrame( + {'VALID_FLOAT': [3.141592653589793]})) def test_should_properly_handle_null_floats(self): query = 'SELECT FLOAT(NULL) as NULL_FLOAT' @@ -266,12 +285,15 @@ def test_should_properly_handle_null_floats(self): def test_should_properly_handle_timestamp_unix_epoch(self): query = 'SELECT TIMESTAMP("1970-01-01 00:00:00") as UNIX_EPOCH' df = gbq.read_gbq(query, project_id=PROJECT_ID) - tm.assert_frame_equal(df, DataFrame({'UNIX_EPOCH': [np.datetime64('1970-01-01T00:00:00.000000Z')]})) + tm.assert_frame_equal(df, DataFrame( + {'UNIX_EPOCH': [np.datetime64('1970-01-01T00:00:00.000000Z')]})) def test_should_properly_handle_arbitrary_timestamp(self): query = 'SELECT TIMESTAMP("2004-09-15 05:00:00") as VALID_TIMESTAMP' df = gbq.read_gbq(query, project_id=PROJECT_ID) - tm.assert_frame_equal(df, DataFrame({'VALID_TIMESTAMP': [np.datetime64('2004-09-15T05:00:00.000000Z')]})) + tm.assert_frame_equal(df, DataFrame({ + 'VALID_TIMESTAMP': [np.datetime64('2004-09-15T05:00:00.000000Z')] + })) def test_should_properly_handle_null_timestamp(self): query = 'SELECT TIMESTAMP(NULL) as NULL_TIMESTAMP' @@ -310,29 +332,36 @@ def test_unicode_string_conversion_and_normalization(self): def test_index_column(self): query = "SELECT 'a' as STRING_1, 'b' as STRING_2" - result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, index_col="STRING_1") - correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': ['b']}).set_index("STRING_1") + result_frame = gbq.read_gbq( + query, project_id=PROJECT_ID, index_col="STRING_1") + correct_frame = DataFrame( + {'STRING_1': ['a'], 'STRING_2': ['b']}).set_index("STRING_1") tm.assert_equal(result_frame.index.name, correct_frame.index.name) def test_column_order(self): query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" col_order = ['STRING_3', 'STRING_1', 'STRING_2'] - result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, col_order=col_order) - correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': ['b'], 'STRING_3': ['c']})[col_order] + result_frame = gbq.read_gbq( + query, project_id=PROJECT_ID, col_order=col_order) + correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': [ + 'b'], 'STRING_3': ['c']})[col_order] 
tm.assert_frame_equal(result_frame, correct_frame) def test_column_order_plus_index(self): query = "SELECT 'a' as STRING_1, 'b' as STRING_2, 'c' as STRING_3" col_order = ['STRING_3', 'STRING_2'] - result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, index_col='STRING_1', col_order=col_order) - correct_frame = DataFrame({'STRING_1': ['a'], 'STRING_2': ['b'], 'STRING_3': ['c']}) + result_frame = gbq.read_gbq(query, project_id=PROJECT_ID, + index_col='STRING_1', col_order=col_order) + correct_frame = DataFrame( + {'STRING_1': ['a'], 'STRING_2': ['b'], 'STRING_3': ['c']}) correct_frame.set_index('STRING_1', inplace=True) correct_frame = correct_frame[col_order] tm.assert_frame_equal(result_frame, correct_frame) def test_malformed_query(self): with tm.assertRaises(gbq.GenericGBQException): - gbq.read_gbq("SELCET * FORM [publicdata:samples.shakespeare]", project_id=PROJECT_ID) + gbq.read_gbq("SELCET * FORM [publicdata:samples.shakespeare]", + project_id=PROJECT_ID) def test_bad_project_id(self): with tm.assertRaises(gbq.GenericGBQException): @@ -340,19 +369,24 @@ def test_bad_project_id(self): def test_bad_table_name(self): with tm.assertRaises(gbq.GenericGBQException): - gbq.read_gbq("SELECT * FROM [publicdata:samples.nope]", project_id=PROJECT_ID) + gbq.read_gbq("SELECT * FROM [publicdata:samples.nope]", + project_id=PROJECT_ID) def test_download_dataset_larger_than_200k_rows(self): test_size = 200005 # Test for known BigQuery bug in datasets larger than 100k rows # http://stackoverflow.com/questions/19145587/bq-py-not-paging-results - df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] GROUP EACH BY id ORDER BY id ASC LIMIT {0}".format(test_size), + df = gbq.read_gbq("SELECT id FROM [publicdata:samples.wikipedia] " + "GROUP EACH BY id ORDER BY id ASC LIMIT {0}" + .format(test_size), project_id=PROJECT_ID) self.assertEqual(len(df.drop_duplicates()), test_size) def test_zero_rows(self): # Bug fix for https://github.com/pydata/pandas/issues/10273 - df = gbq.read_gbq("SELECT title, language FROM [publicdata:samples.wikipedia] where timestamp=-9999999", + df = gbq.read_gbq("SELECT title, language FROM " + "[publicdata:samples.wikipedia] where " + "timestamp=-9999999", project_id=PROJECT_ID) expected_result = DataFrame(columns=['title', 'language']) self.assert_frame_equal(df, expected_result) @@ -361,17 +395,19 @@ def test_zero_rows(self): class TestToGBQIntegration(tm.TestCase): # Changes to BigQuery table schema may take up to 2 minutes as of May 2015 # As a workaround to this issue, each test should use a unique table name. - # Make sure to modify the for loop range in the tearDownClass when a new test is added - # See `Issue 191 `__ + # Make sure to modify the for loop range in the tearDownClass when a new + # test is added See `Issue 191 + # `__ @classmethod def setUpClass(cls): # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *BEFORE* executing *ALL* tests - # described below. + # put here any instruction you want to execute only *ONCE* *BEFORE* + # executing *ALL* tests described below. if not PROJECT_ID: - raise nose.SkipTest("Cannot run integration tests without a project id") + raise nose.SkipTest( + "Cannot run integration tests without a project id") test_requirements() clean_gbq_environment() @@ -380,7 +416,8 @@ def setUpClass(cls): def setUp(self): # - PER-TEST FIXTURES - - # put here any instruction you want to be run *BEFORE* *EVERY* test is executed. 
+ # put here any instruction you want to be run *BEFORE* *EVERY* test is + # executed. self.dataset = gbq._Dataset(PROJECT_ID) self.table = gbq._Table(PROJECT_ID, DATASET_ID + "1") @@ -388,13 +425,15 @@ def setUp(self): @classmethod def tearDownClass(cls): # - GLOBAL CLASS FIXTURES - - # put here any instruction you want to execute only *ONCE* *AFTER* executing all tests. + # put here any instruction you want to execute only *ONCE* *AFTER* + # executing all tests. clean_gbq_environment() def tearDown(self): # - PER-TEST FIXTURES - - # put here any instructions you want to be run *AFTER* *EVERY* test is executed. + # put here any instructions you want to be run *AFTER* *EVERY* test is + # executed. pass def test_upload_data(self): @@ -407,7 +446,8 @@ def test_upload_data(self): sleep(60) # <- Curses Google!!! - result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), + result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" + .format(destination_table), project_id=PROJECT_ID) self.assertEqual(result['NUM_ROWS'][0], test_size) @@ -441,12 +481,15 @@ def test_upload_data_if_table_exists_append(self): sleep(60) # <- Curses Google!!! - result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), project_id=PROJECT_ID) + result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" + .format(destination_table), + project_id=PROJECT_ID) self.assertEqual(result['NUM_ROWS'][0], test_size * 2) # Try inserting with a different schema, confirm failure with tm.assertRaises(gbq.InvalidSchema): - gbq.to_gbq(df_different_schema, destination_table, PROJECT_ID, if_exists='append') + gbq.to_gbq(df_different_schema, destination_table, + PROJECT_ID, if_exists='append') def test_upload_data_if_table_exists_replace(self): destination_table = DESTINATION_TABLE + "4" @@ -459,19 +502,24 @@ def test_upload_data_if_table_exists_replace(self): gbq.to_gbq(df, destination_table, PROJECT_ID, chunksize=10000) # Test the if_exists parameter with the value 'replace'. - gbq.to_gbq(df_different_schema, destination_table, PROJECT_ID, if_exists='replace') + gbq.to_gbq(df_different_schema, destination_table, + PROJECT_ID, if_exists='replace') sleep(60) # <- Curses Google!!! 
- result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}".format(destination_table), project_id=PROJECT_ID) + result = gbq.read_gbq("SELECT COUNT(*) as NUM_ROWS FROM {0}" + .format(destination_table), + project_id=PROJECT_ID) self.assertEqual(result['NUM_ROWS'][0], 5) def test_google_upload_errors_should_raise_exception(self): destination_table = DESTINATION_TABLE + "5" test_timestamp = datetime.now(pytz.timezone('US/Arizona')) - bad_df = DataFrame({'bools': [False, False], 'flts': [0.0, 1.0], 'ints': [0, '1'], 'strs': ['a', 1], - 'times': [test_timestamp, test_timestamp]}, index=range(2)) + bad_df = DataFrame({'bools': [False, False], 'flts': [0.0, 1.0], + 'ints': [0, '1'], 'strs': ['a', 1], + 'times': [test_timestamp, test_timestamp]}, + index=range(2)) with tm.assertRaises(gbq.StreamingInsertError): gbq.to_gbq(bad_df, destination_table, PROJECT_ID, verbose=True) @@ -489,56 +537,72 @@ def test_generate_schema(self): def test_create_table(self): destination_table = TABLE_ID + "6" - test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'}, {'name': 'B', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, {'name': 'D', 'type': 'TIMESTAMP'}]} + test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'}, + {'name': 'B', 'type': 'FLOAT'}, + {'name': 'C', 'type': 'STRING'}, + {'name': 'D', 'type': 'TIMESTAMP'}]} self.table.create(destination_table, test_schema) - self.assertTrue(self.table.exists(destination_table), 'Expected table to exist') + self.assertTrue(self.table.exists(destination_table), + 'Expected table to exist') def test_table_does_not_exist(self): - self.assertTrue(not self.table.exists(TABLE_ID + "7"), 'Expected table not to exist') + self.assertTrue(not self.table.exists(TABLE_ID + "7"), + 'Expected table not to exist') def test_delete_table(self): destination_table = TABLE_ID + "8" - test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'}, {'name': 'B', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, {'name': 'D', 'type': 'TIMESTAMP'}]} + test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'}, + {'name': 'B', 'type': 'FLOAT'}, + {'name': 'C', 'type': 'STRING'}, + {'name': 'D', 'type': 'TIMESTAMP'}]} self.table.create(destination_table, test_schema) self.table.delete(destination_table) - self.assertTrue(not self.table.exists(destination_table), 'Expected table not to exist') + self.assertTrue(not self.table.exists( + destination_table), 'Expected table not to exist') def test_list_table(self): destination_table = TABLE_ID + "9" - test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'}, {'name': 'B', 'type': 'FLOAT'}, - {'name': 'C', 'type': 'STRING'}, {'name': 'D', 'type': 'TIMESTAMP'}]} + test_schema = {'fields': [{'name': 'A', 'type': 'FLOAT'}, + {'name': 'B', 'type': 'FLOAT'}, + {'name': 'C', 'type': 'STRING'}, + {'name': 'D', 'type': 'TIMESTAMP'}]} self.table.create(destination_table, test_schema) - self.assertTrue(destination_table in self.dataset.tables(DATASET_ID + "1"), - 'Expected table list to contain table {0}'.format(destination_table)) + self.assertTrue( + destination_table in self.dataset.tables(DATASET_ID + "1"), + 'Expected table list to contain table {0}' + .format(destination_table)) def test_list_dataset(self): dataset_id = DATASET_ID + "1" self.assertTrue(dataset_id in self.dataset.datasets(), - 'Expected dataset list to contain dataset {0}'.format(dataset_id)) + 'Expected dataset list to contain dataset {0}' + .format(dataset_id)) def test_list_table_zero_results(self): dataset_id = DATASET_ID + "2" self.dataset.create(dataset_id) table_list = 
gbq._Dataset(PROJECT_ID).tables(dataset_id) - self.assertEqual(len(table_list), 0, 'Expected gbq.list_table() to return 0') + self.assertEqual(len(table_list), 0, + 'Expected gbq.list_table() to return 0') def test_create_dataset(self): dataset_id = DATASET_ID + "3" self.dataset.create(dataset_id) - self.assertTrue(dataset_id in self.dataset.datasets(), 'Expected dataset to exist') + self.assertTrue(dataset_id in self.dataset.datasets(), + 'Expected dataset to exist') def test_delete_dataset(self): dataset_id = DATASET_ID + "4" self.dataset.create(dataset_id) self.dataset.delete(dataset_id) - self.assertTrue(dataset_id not in self.dataset.datasets(), 'Expected dataset not to exist') + self.assertTrue(dataset_id not in self.dataset.datasets(), + 'Expected dataset not to exist') def test_dataset_exists(self): dataset_id = DATASET_ID + "5" self.dataset.create(dataset_id) - self.assertTrue(self.dataset.exists(dataset_id), 'Expected dataset to exist') + self.assertTrue(self.dataset.exists(dataset_id), + 'Expected dataset to exist') def create_table_data_dataset_does_not_exist(self): dataset_id = DATASET_ID + "6" @@ -546,11 +610,14 @@ def create_table_data_dataset_does_not_exist(self): table_with_new_dataset = gbq._Table(PROJECT_ID, dataset_id) df = make_mixed_dataframe_v2(10) table_with_new_dataset.create(table_id, gbq._generate_bq_schema(df)) - self.assertTrue(self.dataset.exists(dataset_id), 'Expected dataset to exist') - self.assertTrue(table_with_new_dataset.exists(table_id), 'Expected dataset to exist') + self.assertTrue(self.dataset.exists(dataset_id), + 'Expected dataset to exist') + self.assertTrue(table_with_new_dataset.exists( + table_id), 'Expected dataset to exist') def test_dataset_does_not_exist(self): - self.assertTrue(not self.dataset.exists(DATASET_ID + "_not_found"), 'Expected dataset not to exist') + self.assertTrue(not self.dataset.exists( + DATASET_ID + "_not_found"), 'Expected dataset not to exist') if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], diff --git a/pandas/io/tests/test_html.py b/pandas/io/tests/test_html.py index 141533a131e42..9a18da7d57648 100644 --- a/pandas/io/tests/test_html.py +++ b/pandas/io/tests/test_html.py @@ -20,7 +20,8 @@ from pandas import (DataFrame, MultiIndex, read_csv, Timestamp, Index, date_range, Series) -from pandas.compat import map, zip, StringIO, string_types, BytesIO, is_platform_windows +from pandas.compat import (map, zip, StringIO, string_types, BytesIO, + is_platform_windows) from pandas.io.common import URLError, urlopen, file_path_to_url from pandas.io.html import read_html from pandas.parser import CParserError @@ -87,6 +88,7 @@ def test_bs4_version_fails(): class ReadHtmlMixin(object): + def read_html(self, *args, **kwargs): kwargs.setdefault('flavor', self.flavor) return read_html(*args, **kwargs) @@ -437,8 +439,9 @@ def test_tfoot_read(self): ''' - data1 = data_template.format(footer = "") - data2 = data_template.format(footer ="footAfootB") + data1 = data_template.format(footer="") + data2 = data_template.format( + footer="footAfootB") d1 = {'A': ['bodyA'], 'B': ['bodyB']} d2 = {'A': ['bodyA', 'footA'], 'B': ['bodyB', 'footB']} @@ -528,10 +531,10 @@ def try_remove_ws(x): dfnew = df.applymap(try_remove_ws).replace(old, new) gtnew = ground_truth.applymap(try_remove_ws) converted = dfnew._convert(datetime=True, numeric=True) - date_cols = ['Closing Date','Updated Date'] + date_cols = ['Closing Date', 'Updated Date'] converted[date_cols] = 
converted[date_cols]._convert(datetime=True, coerce=True) - tm.assert_frame_equal(converted,gtnew) + tm.assert_frame_equal(converted, gtnew) @slow def test_gold_canyon(self): @@ -638,11 +641,12 @@ def test_wikipedia_states_table(self): nose.tools.assert_equal(result['sq mi'].dtype, np.dtype('float64')) def test_bool_header_arg(self): - #GH 6114 + # GH 6114 for arg in [True, False]: with tm.assertRaises(TypeError): read_html(self.spam_data, header=arg) + def _lang_enc(filename): return os.path.splitext(os.path.basename(filename))[0].split('_') @@ -682,14 +686,14 @@ def test_encode(self): from_filename = self.read_filename(f, encoding).pop() tm.assert_frame_equal(from_string, from_file_like) tm.assert_frame_equal(from_string, from_filename) - except Exception as e: - + except Exception: # seems utf-16/32 fail on windows if is_platform_windows(): if '16' in encoding or '32' in encoding: continue raise + class TestReadHtmlEncodingLxml(TestReadHtmlEncoding): flavor = 'lxml' diff --git a/pandas/io/tests/test_json/test_pandas.py b/pandas/io/tests/test_json/test_pandas.py index 1690667ef743b..2889acef8180d 100644 --- a/pandas/io/tests/test_json/test_pandas.py +++ b/pandas/io/tests/test_json/test_pandas.py @@ -3,7 +3,7 @@ import os import numpy as np -from pandas import (Series, DataFrame, DatetimeIndex, Timestamp, CategoricalIndex, +from pandas import (Series, DataFrame, DatetimeIndex, Timestamp, read_json, compat) from datetime import timedelta import pandas as pd @@ -23,13 +23,15 @@ _tsframe = DataFrame(_tsd) _cat_frame = _frame.copy() -cat = ['bah']*5 + ['bar']*5 + ['baz']*5 + ['foo']*(len(_cat_frame)-15) -_cat_frame.index = pd.CategoricalIndex(cat,name='E') +cat = ['bah'] * 5 + ['bar'] * 5 + ['baz'] * \ + 5 + ['foo'] * (len(_cat_frame) - 15) +_cat_frame.index = pd.CategoricalIndex(cat, name='E') _cat_frame['E'] = list(reversed(cat)) -_cat_frame['sort'] = np.arange(len(_cat_frame),dtype='int64') +_cat_frame['sort'] = np.arange(len(_cat_frame), dtype='int64') _mixed_frame = _frame.copy() + class TestPandasContainer(tm.TestCase): def setUp(self): @@ -116,7 +118,8 @@ def test_frame_non_unique_columns(self): np.testing.assert_equal(df.values, unser.values) # GH4377; duplicate columns not processing correctly - df = DataFrame([['a','b'],['c','d']], index=[1,2], columns=['x','y']) + df = DataFrame([['a', 'b'], ['c', 'd']], index=[ + 1, 2], columns=['x', 'y']) result = read_json(df.to_json(orient='split'), orient='split') assert_frame_equal(result, df) @@ -125,11 +128,12 @@ def _check(df): convert_dates=['x']) assert_frame_equal(result, df) - for o in [[['a','b'],['c','d']], - [[1.5,2.5],[3.5,4.5]], - [[1,2.5],[3,4.5]], - [[Timestamp('20130101'),3.5],[Timestamp('20130102'),4.5]]]: - _check(DataFrame(o, index=[1,2], columns=['x','x'])) + for o in [[['a', 'b'], ['c', 'd']], + [[1.5, 2.5], [3.5, 4.5]], + [[1, 2.5], [3, 4.5]], + [[Timestamp('20130101'), 3.5], + [Timestamp('20130102'), 4.5]]]: + _check(DataFrame(o, index=[1, 2], columns=['x', 'x'])) def test_frame_from_json_to_json(self): def _check_orient(df, orient, dtype=None, numpy=False, @@ -143,11 +147,14 @@ def _check_orient(df, orient, dtype=None, numpy=False, # if we are not unique, then check that we are raising ValueError # for the appropriate orients - if not df.index.is_unique and orient in ['index','columns']: - self.assertRaises(ValueError, lambda : df.to_json(orient=orient)) + if not df.index.is_unique and orient in ['index', 'columns']: + self.assertRaises( + ValueError, lambda: df.to_json(orient=orient)) return - if not 
df.columns.is_unique and orient in ['index','columns','records']: - self.assertRaises(ValueError, lambda : df.to_json(orient=orient)) + if (not df.columns.is_unique and + orient in ['index', 'columns', 'records']): + self.assertRaises( + ValueError, lambda: df.to_json(orient=orient)) return dfjson = df.to_json(orient=orient) @@ -167,7 +174,7 @@ def _check_orient(df, orient, dtype=None, numpy=False, unser = unser.sort_index() if dtype is False: - check_dtype=False + check_dtype = False if not convert_axes and df.index.dtype.type == np.datetime64: unser.index = DatetimeIndex( @@ -199,8 +206,8 @@ def _check_orient(df, orient, dtype=None, numpy=False, assert_frame_equal(df, unser, check_less_precise=False, check_dtype=check_dtype) - def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None, - sort=None, check_index_type=True, + def _check_all_orients(df, dtype=None, convert_axes=True, + raise_ok=None, sort=None, check_index_type=True, check_column_type=True): # numpy=False @@ -216,11 +223,16 @@ def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None, _check_orient(df, "values", dtype=dtype, sort=sort, check_index_type=False, check_column_type=False) - _check_orient(df, "columns", dtype=dtype, convert_axes=False, sort=sort) - _check_orient(df, "records", dtype=dtype, convert_axes=False, sort=sort) - _check_orient(df, "split", dtype=dtype, convert_axes=False, sort=sort) - _check_orient(df, "index", dtype=dtype, convert_axes=False, sort=sort) - _check_orient(df, "values", dtype=dtype ,convert_axes=False, sort=sort) + _check_orient(df, "columns", dtype=dtype, + convert_axes=False, sort=sort) + _check_orient(df, "records", dtype=dtype, + convert_axes=False, sort=sort) + _check_orient(df, "split", dtype=dtype, + convert_axes=False, sort=sort) + _check_orient(df, "index", dtype=dtype, + convert_axes=False, sort=sort) + _check_orient(df, "values", dtype=dtype, + convert_axes=False, sort=sort) # numpy=True and raise_ok might be not None, so ignore the error if convert_axes: @@ -265,7 +277,7 @@ def _check_all_orients(df, dtype=None, convert_axes=True, raise_ok=None, biggie = DataFrame(np.zeros((200, 4)), columns=[str(i) for i in range(4)], index=[str(i) for i in range(200)]) - _check_all_orients(biggie,dtype=False, convert_axes=False) + _check_all_orients(biggie, dtype=False, convert_axes=False) # dtypes _check_all_orients(DataFrame(biggie, dtype=np.float64), @@ -336,31 +348,32 @@ def test_frame_from_json_nones(self): df = DataFrame([['1', '2'], ['4', '5', '6']]) unser = read_json(df.to_json()) self.assertTrue(np.isnan(unser[2][0])) - unser = read_json(df.to_json(),dtype=False) + unser = read_json(df.to_json(), dtype=False) self.assertTrue(unser[2][0] is None) - unser = read_json(df.to_json(),convert_axes=False,dtype=False) + unser = read_json(df.to_json(), convert_axes=False, dtype=False) self.assertTrue(unser['2']['0'] is None) unser = read_json(df.to_json(), numpy=False) self.assertTrue(np.isnan(unser[2][0])) unser = read_json(df.to_json(), numpy=False, dtype=False) self.assertTrue(unser[2][0] is None) - unser = read_json(df.to_json(), numpy=False, convert_axes=False, dtype=False) + unser = read_json(df.to_json(), numpy=False, + convert_axes=False, dtype=False) self.assertTrue(unser['2']['0'] is None) # infinities get mapped to nulls which get mapped to NaNs during # deserialisation df = DataFrame([[1, 2], [4, 5, 6]]) - df.loc[0,2] = np.inf + df.loc[0, 2] = np.inf unser = read_json(df.to_json()) self.assertTrue(np.isnan(unser[2][0])) unser = read_json(df.to_json(), 
dtype=False) self.assertTrue(np.isnan(unser[2][0])) - df.loc[0,2] = np.NINF + df.loc[0, 2] = np.NINF unser = read_json(df.to_json()) self.assertTrue(np.isnan(unser[2][0])) - unser = read_json(df.to_json(),dtype=False) + unser = read_json(df.to_json(), dtype=False) self.assertTrue(np.isnan(unser[2][0])) def test_frame_to_json_except(self): @@ -410,11 +423,11 @@ def test_frame_mixedtype_orient(self): # GH10289 def test_v12_compat(self): df = DataFrame( - [[1.56808523, 0.65727391, 1.81021139, -0.17251653], + [[1.56808523, 0.65727391, 1.81021139, -0.17251653], [-0.2550111, -0.08072427, -0.03202878, -0.17581665], - [1.51493992, 0.11805825, 1.629455, -1.31506612], - [-0.02765498, 0.44679743, 0.33192641, -0.27885413], - [0.05951614, -2.69652057, 1.28163262, 0.34703478]], + [1.51493992, 0.11805825, 1.629455, -1.31506612], + [-0.02765498, 0.44679743, 0.33192641, -0.27885413], + [0.05951614, -2.69652057, 1.28163262, 0.34703478]], columns=['A', 'B', 'C', 'D'], index=pd.date_range('2000-01-03', '2000-01-07')) df['date'] = pd.Timestamp('19920106 18:21:32.12') @@ -438,10 +451,10 @@ def test_blocks_compat_GH9037(self): -0.60316077, 0.24653374, 0.28668979, -2.51969012, 0.95748401, -1.02970536], int_1=[19680418, 75337055, 99973684, 65103179, 79373900, - 40314334, 21290235, 4991321, 41903419, 16008365], + 40314334, 21290235, 4991321, 41903419, 16008365], str_1=['78c608f1', '64a99743', '13d2ff52', 'ca7f4af2', '97236474', 'bde7e214', '1a6bde47', 'b1190be5', '7a669144', '8d64d068'], - float_2=[-0.0428278, -1.80872357, 3.36042349, -0.7573685, + float_2=[-0.0428278, -1.80872357, 3.36042349, -0.7573685, -0.48217572, 0.86229683, 1.08935819, 0.93898739, -0.03030452, 1.43366348], str_2=['14f04af9', 'd085da90', '4bcfac83', '81504caf', '2ffef4a9', @@ -468,7 +481,7 @@ def test_series_non_unique_index(self): self.assertRaises(ValueError, s.to_json, orient='index') assert_series_equal(s, read_json(s.to_json(orient='split'), - orient='split', typ='series')) + orient='split', typ='series')) unser = read_json(s.to_json(orient='records'), orient='records', typ='series') np.testing.assert_equal(s.values, unser.values) @@ -532,7 +545,7 @@ def _check_all_orients(series, dtype=None, check_index_type=True): _check_all_orients(self.ts) # dtype - s = Series(lrange(6), index=['a','b','c','d','e','f']) + s = Series(lrange(6), index=['a', 'b', 'c', 'd', 'e', 'f']) _check_all_orients(Series(s, dtype=np.float64), dtype=np.float64) _check_all_orients(Series(s, dtype=np.int), dtype=np.int) @@ -548,12 +561,14 @@ def test_series_from_json_precise_float(self): def test_frame_from_json_precise_float(self): df = DataFrame([[4.56, 4.56, 4.56], [4.56, 4.56, 4.56]]) result = read_json(df.to_json(), precise_float=True) - assert_frame_equal(result, df, check_index_type=False, check_column_type=False) + assert_frame_equal(result, df, check_index_type=False, + check_column_type=False) def test_typ(self): - s = Series(lrange(6), index=['a','b','c','d','e','f'], dtype='int64') - result = read_json(s.to_json(),typ=None) + s = Series(lrange(6), index=['a', 'b', 'c', + 'd', 'e', 'f'], dtype='int64') + result = read_json(s.to_json(), typ=None) assert_series_equal(result, s) def test_reconstruction_index(self): @@ -563,7 +578,8 @@ def test_reconstruction_index(self): self.assertEqual(result.index.dtype, np.float64) self.assertEqual(result.columns.dtype, np.float64) - assert_frame_equal(result, df, check_index_type=False, check_column_type=False) + assert_frame_equal(result, df, check_index_type=False, + check_column_type=False) df = DataFrame({'a': [1, 2, 
3], 'b': [4, 5, 6]}, index=['A', 'B', 'C']) result = read_json(df.to_json()) @@ -614,12 +630,13 @@ def test_convert_dates(self): assert_series_equal(result, ts) def test_convert_dates_infer(self): - #GH10747 + # GH10747 infer_words = ['trade_time', 'date', 'datetime', 'sold_at', 'modified', 'timestamp', 'timestamps'] for infer_word in infer_words: data = [{'id': 1, infer_word: 1036713600000}, {'id': 2}] - expected = DataFrame([[1, Timestamp('2002-11-08')], [2, pd.NaT]], columns=['id', infer_word]) + expected = DataFrame([[1, Timestamp('2002-11-08')], [2, pd.NaT]], + columns=['id', infer_word]) result = read_json(pd.json.dumps(data))[['id', infer_word]] assert_frame_equal(result, expected) @@ -713,17 +730,17 @@ def test_doc_example(self): dfj2['date'] = Timestamp('20130101') dfj2['ints'] = lrange(5) dfj2['bools'] = True - dfj2.index = pd.date_range('20130101',periods=5) + dfj2.index = pd.date_range('20130101', periods=5) json = dfj2.to_json() - result = read_json(json,dtype={'ints' : np.int64, 'bools' : np.bool_}) - assert_frame_equal(result,result) + result = read_json(json, dtype={'ints': np.int64, 'bools': np.bool_}) + assert_frame_equal(result, result) def test_misc_example(self): # parsing unordered input fails result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]', numpy=True) - expected = DataFrame([[1,2], [1,2]], columns=['a', 'b']) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) error_msg = """DataFrame\\.index are different @@ -734,7 +751,7 @@ def test_misc_example(self): assert_frame_equal(result, expected, check_index_type=False) result = read_json('[{"a": 1, "b": 2}, {"b":2, "a" :1}]') - expected = DataFrame([[1,2], [1,2]], columns=['a','b']) + expected = DataFrame([[1, 2], [1, 2]], columns=['a', 'b']) assert_frame_equal(result, expected) @network @@ -744,7 +761,8 @@ def test_round_trip_exception_(self): df = pd.read_csv(csv) s = df.to_json() result = pd.read_json(s) - assert_frame_equal(result.reindex(index=df.index,columns=df.columns),df) + assert_frame_equal(result.reindex( + index=df.index, columns=df.columns), df) @network def test_url(self): @@ -754,22 +772,27 @@ def test_url(self): self.assertEqual(result[c].dtype, 'datetime64[ns]') def test_timedelta(self): - converter = lambda x: pd.to_timedelta(x,unit='ms') + converter = lambda x: pd.to_timedelta(x, unit='ms') s = Series([timedelta(23), timedelta(seconds=5)]) self.assertEqual(s.dtype, 'timedelta64[ns]') # index will be float dtype - assert_series_equal(s, pd.read_json(s.to_json(),typ='series').apply(converter), + assert_series_equal(s, pd.read_json(s.to_json(), typ='series') + .apply(converter), check_index_type=False) - s = Series([timedelta(23), timedelta(seconds=5)], index=pd.Index([0, 1], dtype=float)) + s = Series([timedelta(23), timedelta(seconds=5)], + index=pd.Index([0, 1], dtype=float)) self.assertEqual(s.dtype, 'timedelta64[ns]') - assert_series_equal(s, pd.read_json(s.to_json(), typ='series').apply(converter)) + assert_series_equal(s, pd.read_json( + s.to_json(), typ='series').apply(converter)) frame = DataFrame([timedelta(23), timedelta(seconds=5)]) - self.assertEqual(frame[0].dtype,'timedelta64[ns]') - assert_frame_equal(frame, pd.read_json(frame.to_json()).apply(converter), - check_index_type=False, check_column_type=False) + self.assertEqual(frame[0].dtype, 'timedelta64[ns]') + assert_frame_equal(frame, pd.read_json(frame.to_json()) + .apply(converter), + check_index_type=False, + check_column_type=False) frame = DataFrame({'a': [timedelta(days=23), timedelta(seconds=5)], 'b': [1, 2], @@ 
-800,7 +823,8 @@ def test_default_handler(self): def test_default_handler_raises(self): def my_handler_raises(obj): raise TypeError("raisin") - self.assertRaises(TypeError, DataFrame({'a': [1, 2, object()]}).to_json, + self.assertRaises(TypeError, + DataFrame({'a': [1, 2, object()]}).to_json, default_handler=my_handler_raises) diff --git a/pandas/io/tests/test_json/test_ujson.py b/pandas/io/tests/test_json/test_ujson.py index 9590dbb90e5c6..f5efb54099ddd 100644 --- a/pandas/io/tests/test_json/test_ujson.py +++ b/pandas/io/tests/test_json/test_ujson.py @@ -24,7 +24,6 @@ from numpy.testing import (assert_array_almost_equal_nulp, assert_approx_equal) import pytz -import dateutil from pandas import DataFrame, Series, Index, NaT, DatetimeIndex import pandas.util.testing as tm @@ -38,6 +37,7 @@ def _skip_if_python_ver(skip_major, skip_minor=None): json_unicode = (json.dumps if compat.PY3 else partial(json.dumps, encoding="utf-8")) + class UltraJSONTests(TestCase): def test_encodeDecimal(self): @@ -48,8 +48,10 @@ def test_encodeDecimal(self): def test_encodeStringConversion(self): input = "A string \\ / \b \f \n \r \t &" - not_html_encoded = '"A string \\\\ \\/ \\b \\f \\n \\r \\t <\\/script> &"' - html_encoded = '"A string \\\\ \\/ \\b \\f \\n \\r \\t \\u003c\\/script\\u003e \\u0026"' + not_html_encoded = ('"A string \\\\ \\/ \\b \\f \\n ' + '\\r \\t <\\/script> &"') + html_encoded = ('"A string \\\\ \\/ \\b \\f \\n \\r \\t ' + '\\u003c\\/script\\u003e \\u0026"') def helper(expected_output, **encode_kwargs): output = ujson.encode(input, **encode_kwargs) @@ -127,18 +129,16 @@ def test_encodeDoubleTinyExponential(self): def test_encodeDictWithUnicodeKeys(self): input = {u("key1"): u("value1"), u("key1"): - u("value1"), u("key1"): u("value1"), - u("key1"): u("value1"), u("key1"): - u("value1"), u("key1"): u("value1")} + u("value1"), u("key1"): u("value1"), + u("key1"): u("value1"), u("key1"): + u("value1"), u("key1"): u("value1")} output = ujson.encode(input) input = {u("بن"): u("value1"), u("بن"): u("value1"), - u("بن"): u("value1"), u("بن"): u("value1"), - u("بن"): u("value1"), u("بن"): u("value1"), - u("بن"): u("value1")} - output = ujson.encode(input) - - pass + u("بن"): u("value1"), u("بن"): u("value1"), + u("بن"): u("value1"), u("بن"): u("value1"), + u("بن"): u("value1")} + output = ujson.encode(input) # noqa def test_encodeDoubleConversion(self): input = math.pi @@ -162,45 +162,48 @@ def test_encodeArrayOfNestedArrays(self): input = [[[[]]]] * 20 output = ujson.encode(input) self.assertEqual(input, json.loads(output)) - #self.assertEqual(output, json.dumps(input)) + # self.assertEqual(output, json.dumps(input)) self.assertEqual(input, ujson.decode(output)) input = np.array(input) - tm.assert_numpy_array_equal(input, ujson.decode(output, numpy=True, dtype=input.dtype)) + tm.assert_numpy_array_equal(input, ujson.decode( + output, numpy=True, dtype=input.dtype)) def test_encodeArrayOfDoubles(self): - input = [ 31337.31337, 31337.31337, 31337.31337, 31337.31337] * 10 + input = [31337.31337, 31337.31337, 31337.31337, 31337.31337] * 10 output = ujson.encode(input) self.assertEqual(input, json.loads(output)) - #self.assertEqual(output, json.dumps(input)) + # self.assertEqual(output, json.dumps(input)) self.assertEqual(input, ujson.decode(output)) - tm.assert_numpy_array_equal(np.array(input), ujson.decode(output, numpy=True)) + tm.assert_numpy_array_equal( + np.array(input), ujson.decode(output, numpy=True)) def test_doublePrecisionTest(self): input = 30.012345678901234 - output = 
ujson.encode(input, double_precision = 15) + output = ujson.encode(input, double_precision=15) self.assertEqual(input, json.loads(output)) self.assertEqual(input, ujson.decode(output)) - output = ujson.encode(input, double_precision = 9) + output = ujson.encode(input, double_precision=9) self.assertEqual(round(input, 9), json.loads(output)) self.assertEqual(round(input, 9), ujson.decode(output)) - output = ujson.encode(input, double_precision = 3) + output = ujson.encode(input, double_precision=3) self.assertEqual(round(input, 3), json.loads(output)) self.assertEqual(round(input, 3), ujson.decode(output)) def test_invalidDoublePrecision(self): input = 30.12345678901234567890 - self.assertRaises(ValueError, ujson.encode, input, double_precision = 20) - self.assertRaises(ValueError, ujson.encode, input, double_precision = -1) + self.assertRaises(ValueError, ujson.encode, input, double_precision=20) + self.assertRaises(ValueError, ujson.encode, input, double_precision=-1) # will throw typeError - self.assertRaises(TypeError, ujson.encode, input, double_precision = '9') + self.assertRaises(TypeError, ujson.encode, input, double_precision='9') # will throw typeError - self.assertRaises(TypeError, ujson.encode, input, double_precision = None) + self.assertRaises(TypeError, ujson.encode, + input, double_precision=None) - def test_encodeStringConversion(self): + def test_encodeStringConversion2(self): input = "A string \\ / \b \f \n \r \t" output = ujson.encode(input) self.assertEqual(input, json.loads(output)) @@ -270,7 +273,8 @@ def test_encodeArrayInArray(self): self.assertEqual(input, json.loads(output)) self.assertEqual(output, json.dumps(input)) self.assertEqual(input, ujson.decode(output)) - tm.assert_numpy_array_equal(np.array(input), ujson.decode(output, numpy=True)) + tm.assert_numpy_array_equal( + np.array(input), ujson.decode(output, numpy=True)) pass def test_encodeIntConversion(self): @@ -293,25 +297,22 @@ def test_encodeLongNegConversion(self): input = -9223372036854775808 output = ujson.encode(input) - outputjson = json.loads(output) - outputujson = ujson.decode(output) - self.assertEqual(input, json.loads(output)) self.assertEqual(output, json.dumps(input)) self.assertEqual(input, ujson.decode(output)) - pass def test_encodeListConversion(self): - input = [ 1, 2, 3, 4 ] + input = [1, 2, 3, 4] output = ujson.encode(input) self.assertEqual(input, json.loads(output)) self.assertEqual(input, ujson.decode(output)) - tm.assert_numpy_array_equal(np.array(input), ujson.decode(output, numpy=True)) + tm.assert_numpy_array_equal( + np.array(input), ujson.decode(output, numpy=True)) pass def test_encodeDictConversion(self): - input = { "k1": 1, "k2": 2, "k3": 3, "k4": 4 } - output = ujson.encode(input) + input = {"k1": 1, "k2": 2, "k3": 3, "k4": 4} + output = ujson.encode(input) # noqa self.assertEqual(input, json.loads(output)) self.assertEqual(input, ujson.decode(output)) self.assertEqual(input, ujson.decode(output)) @@ -365,8 +366,9 @@ def test_encodeTimeConversion(self): datetime.time(1, 2, 3), datetime.time(10, 12, 15, 343243), datetime.time(10, 12, 15, 343243, pytz.utc), -# datetime.time(10, 12, 15, 343243, dateutil.tz.gettz('UTC')), # this segfaults! No idea why. - ] + # datetime.time(10, 12, 15, 343243, dateutil.tz.gettz('UTC')), # + # this segfaults! No idea why. 
+ ] for test in tests: output = ujson.encode(test) expected = '"%s"' % test.isoformat() @@ -435,7 +437,7 @@ class O1: input.member.member = input try: - output = ujson.encode(input) + output = ujson.encode(input) # noqa assert False, "Expected overflow exception" except(OverflowError): pass @@ -575,7 +577,7 @@ def test_decodeBrokenDictKeyTypeLeakTest(self): try: ujson.decode(input) assert False, "Expected exception!" - except ValueError as e: + except ValueError: continue assert False, "Wrong exception" @@ -644,7 +646,7 @@ def test_encodeUnicode4BytesUTF8Fail(self): _skip_if_python_ver(3) input = "\xfd\xbf\xbf\xbf\xbf\xbf" try: - enc = ujson.encode(input) + enc = ujson.encode(input) # noqa assert False, "Expected exception" except OverflowError: pass @@ -671,12 +673,13 @@ def test_decodeNullCharacter(self): def test_encodeListLongConversion(self): input = [9223372036854775807, 9223372036854775807, 9223372036854775807, - 9223372036854775807, 9223372036854775807, 9223372036854775807 ] + 9223372036854775807, 9223372036854775807, 9223372036854775807] output = ujson.encode(input) self.assertEqual(input, json.loads(output)) self.assertEqual(input, ujson.decode(output)) - tm.assert_numpy_array_equal(np.array(input), ujson.decode(output, numpy=True, - dtype=np.int64)) + tm.assert_numpy_array_equal(np.array(input), + ujson.decode(output, numpy=True, + dtype=np.int64)) pass def test_encodeLongConversion(self): @@ -734,8 +737,10 @@ def test_dumpToFile(self): def test_dumpToFileLikeObject(self): class filelike: + def __init__(self): self.bytes = '' + def write(self, bytes): self.bytes += bytes f = filelike() @@ -754,10 +759,12 @@ def test_loadFile(self): f = StringIO("[1,2,3,4]") self.assertEqual([1, 2, 3, 4], ujson.load(f)) f = StringIO("[1,2,3,4]") - tm.assert_numpy_array_equal(np.array([1, 2, 3, 4]), ujson.load(f, numpy=True)) + tm.assert_numpy_array_equal( + np.array([1, 2, 3, 4]), ujson.load(f, numpy=True)) def test_loadFileLikeObject(self): class filelike: + def read(self): try: self.end @@ -767,7 +774,8 @@ def read(self): f = filelike() self.assertEqual([1, 2, 3, 4], ujson.load(f)) f = filelike() - tm.assert_numpy_array_equal(np.array([1, 2, 3, 4]), ujson.load(f, numpy=True)) + tm.assert_numpy_array_equal( + np.array([1, 2, 3, 4]), ujson.load(f, numpy=True)) def test_loadFileArgsError(self): try: @@ -779,7 +787,7 @@ def test_loadFileArgsError(self): def test_version(self): assert re.match(r'^\d+\.\d+(\.\d+)?$', ujson.__version__), \ - "ujson.__version__ must be a string like '1.4.0'" + "ujson.__version__ must be a string like '1.4.0'" def test_encodeNumericOverflow(self): try: @@ -804,18 +812,18 @@ class Nested: assert False, "expected OverflowError" def test_decodeNumberWith32bitSignBit(self): - #Test that numbers that fit within 32 bits but would have the + # Test that numbers that fit within 32 bits but would have the # sign bit set (2**31 <= x < 2**32) are decoded properly. 
- boundary1 = 2**31 - boundary2 = 2**32 + boundary1 = 2**31 # noqa + boundary2 = 2**32 # noqa docs = ( '{"id": 3590016419}', '{"id": %s}' % 2**31, '{"id": %s}' % 2**32, - '{"id": %s}' % ((2**32)-1), + '{"id": %s}' % ((2**32) - 1), ) - results = (3590016419, 2**31, 2**32, 2**32-1) - for doc,result in zip(docs, results): + results = (3590016419, 2**31, 2**32, 2**32 - 1) + for doc, result in zip(docs, results): self.assertEqual(ujson.decode(doc)['id'], result) def test_encodeBigEscape(self): @@ -825,7 +833,7 @@ def test_encodeBigEscape(self): else: base = "\xc3\xa5" input = base * 1024 * 1024 * 2 - output = ujson.encode(input) + output = ujson.encode(input) # noqa def test_decodeBigEscape(self): for x in range(10): @@ -835,12 +843,13 @@ def test_decodeBigEscape(self): base = "\xc3\xa5" quote = compat.str_to_bytes("\"") input = quote + (base * 1024 * 1024 * 2) + quote - output = ujson.decode(input) + output = ujson.decode(input) # noqa def test_toDict(self): d = {u("key"): 31337} class DictTest: + def toDict(self): return d @@ -865,12 +874,12 @@ def __str__(self): self.assertRaises(OverflowError, ujson.encode, _TestObject("foo")) self.assertEqual('"foo"', ujson.encode(_TestObject("foo"), - default_handler=str)) + default_handler=str)) def my_handler(obj): return "foobar" self.assertEqual('"foobar"', ujson.encode(_TestObject("foo"), - default_handler=my_handler)) + default_handler=my_handler)) def my_handler_raises(obj): raise TypeError("I raise for anything") @@ -892,7 +901,7 @@ def my_obj_handler(obj): l = [_TestObject("foo"), _TestObject("bar")] self.assertEqual(json.loads(json.dumps(l, default=str)), - ujson.decode(ujson.encode(l, default_handler=str))) + ujson.decode(ujson.encode(l, default_handler=str))) class NumpyJSONTests(TestCase): @@ -902,8 +911,8 @@ def testBool(self): self.assertEqual(ujson.decode(ujson.encode(b)), b) def testBoolArray(self): - inpt = np.array([True, False, True, True, False, True, False , False], - dtype=np.bool) + inpt = np.array([True, False, True, True, False, True, False, False], + dtype=np.bool) outp = np.array(ujson.decode(ujson.encode(inpt)), dtype=np.bool) tm.assert_numpy_array_equal(inpt, outp) @@ -990,43 +999,56 @@ def testFloatArray(self): for dtype in dtypes: inpt = arr.astype(dtype) - outp = np.array(ujson.decode(ujson.encode(inpt, double_precision=15)), dtype=dtype) + outp = np.array(ujson.decode(ujson.encode( + inpt, double_precision=15)), dtype=dtype) assert_array_almost_equal_nulp(inpt, outp) def testFloatMax(self): - num = np.float(np.finfo(np.float).max/10) - assert_approx_equal(np.float(ujson.decode(ujson.encode(num, double_precision=15))), num, 15) + num = np.float(np.finfo(np.float).max / 10) + assert_approx_equal(np.float(ujson.decode( + ujson.encode(num, double_precision=15))), num, 15) - num = np.float32(np.finfo(np.float32).max/10) - assert_approx_equal(np.float32(ujson.decode(ujson.encode(num, double_precision=15))), num, 15) + num = np.float32(np.finfo(np.float32).max / 10) + assert_approx_equal(np.float32(ujson.decode( + ujson.encode(num, double_precision=15))), num, 15) - num = np.float64(np.finfo(np.float64).max/10) - assert_approx_equal(np.float64(ujson.decode(ujson.encode(num, double_precision=15))), num, 15) + num = np.float64(np.finfo(np.float64).max / 10) + assert_approx_equal(np.float64(ujson.decode( + ujson.encode(num, double_precision=15))), num, 15) def testArrays(self): arr = np.arange(100) arr = arr.reshape((10, 10)) - tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) - 
tm.assert_numpy_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + tm.assert_numpy_array_equal( + np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal(ujson.decode( + ujson.encode(arr), numpy=True), arr) arr = arr.reshape((5, 5, 4)) - tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) - tm.assert_numpy_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + tm.assert_numpy_array_equal( + np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal(ujson.decode( + ujson.encode(arr), numpy=True), arr) arr = arr.reshape((100, 1)) - tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) - tm.assert_numpy_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + tm.assert_numpy_array_equal( + np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal(ujson.decode( + ujson.encode(arr), numpy=True), arr) arr = np.arange(96) arr = arr.reshape((2, 2, 2, 2, 3, 2)) - tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) - tm.assert_numpy_array_equal(ujson.decode(ujson.encode(arr), numpy=True), arr) + tm.assert_numpy_array_equal( + np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal(ujson.decode( + ujson.encode(arr), numpy=True), arr) l = ['a', list(), dict(), dict(), list(), 42, 97.8, ['a', 'b'], {'key': 'val'}] arr = np.array(l) - tm.assert_numpy_array_equal(np.array(ujson.decode(ujson.encode(arr))), arr) + tm.assert_numpy_array_equal( + np.array(ujson.decode(ujson.encode(arr))), arr) arr = np.arange(100.202, 200.202, 1, dtype=np.float32) arr = arr.reshape((5, 5, 4)) @@ -1137,17 +1159,22 @@ def testArrayNumpyLabelled(self): self.assertTrue(output[1] is None) self.assertTrue((np.array([u('a')]) == output[2]).all()) - # Write out the dump explicitly so there is no dependency on iteration order GH10837 - input_dumps = '[{"a": 42, "b":31}, {"a": 24, "c": 99}, {"a": 2.4, "b": 78}]' + # Write out the dump explicitly so there is no dependency on iteration + # order GH10837 + input_dumps = ('[{"a": 42, "b":31}, {"a": 24, "c": 99}, ' + '{"a": 2.4, "b": 78}]') output = ujson.loads(input_dumps, numpy=True, labelled=True) - expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) + expectedvals = np.array( + [42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) self.assertTrue((expectedvals == output[0]).all()) self.assertTrue(output[1] is None) self.assertTrue((np.array([u('a'), 'b']) == output[2]).all()) - input_dumps = '{"1": {"a": 42, "b":31}, "2": {"a": 24, "c": 99}, "3": {"a": 2.4, "b": 78}}' + input_dumps = ('{"1": {"a": 42, "b":31}, "2": {"a": 24, "c": 99}, ' + '"3": {"a": 2.4, "b": 78}}') output = ujson.loads(input_dumps, numpy=True, labelled=True) - expectedvals = np.array([42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) + expectedvals = np.array( + [42, 31, 24, 99, 2.4, 78], dtype=int).reshape((3, 2)) self.assertTrue((expectedvals == output[0]).all()) self.assertTrue((np.array(['1', '2', '3']) == output[1]).all()) self.assertTrue((np.array(['a', 'b']) == output[2]).all()) @@ -1156,7 +1183,8 @@ def testArrayNumpyLabelled(self): class PandasJSONTests(TestCase): def testDataFrame(self): - df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ + 'a', 'b'], columns=['x', 'y', 'z']) # column indexed outp = DataFrame(ujson.decode(ujson.encode(df))) @@ -1185,7 +1213,8 @@ def testDataFrame(self): 
tm.assert_numpy_array_equal(df.transpose().index, outp.index) def testDataFrameNumpy(self): - df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ + 'a', 'b'], columns=['x', 'y', 'z']) # column indexed outp = DataFrame(ujson.decode(ujson.encode(df), numpy=True)) @@ -1194,19 +1223,21 @@ def testDataFrameNumpy(self): tm.assert_numpy_array_equal(df.index, outp.index) dec = _clean_dict(ujson.decode(ujson.encode(df, orient="split"), - numpy=True)) + numpy=True)) outp = DataFrame(**dec) self.assertTrue((df == outp).values.all()) tm.assert_numpy_array_equal(df.columns, outp.columns) tm.assert_numpy_array_equal(df.index, outp.index) - outp = DataFrame(ujson.decode(ujson.encode(df, orient="index"), numpy=True)) + outp = DataFrame(ujson.decode( + ujson.encode(df, orient="index"), numpy=True)) self.assertTrue((df.transpose() == outp).values.all()) tm.assert_numpy_array_equal(df.transpose().columns, outp.columns) tm.assert_numpy_array_equal(df.transpose().index, outp.index) def testDataFrameNested(self): - df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ + 'a', 'b'], columns=['x', 'y', 'z']) nested = {'df1': df, 'df2': df.copy()} @@ -1216,41 +1247,50 @@ def testDataFrameNested(self): exp = {'df1': ujson.decode(ujson.encode(df, orient="index")), 'df2': ujson.decode(ujson.encode(df, orient="index"))} - self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp) + self.assertTrue(ujson.decode( + ujson.encode(nested, orient="index")) == exp) exp = {'df1': ujson.decode(ujson.encode(df, orient="records")), 'df2': ujson.decode(ujson.encode(df, orient="records"))} - self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp) + self.assertTrue(ujson.decode( + ujson.encode(nested, orient="records")) == exp) exp = {'df1': ujson.decode(ujson.encode(df, orient="values")), 'df2': ujson.decode(ujson.encode(df, orient="values"))} - self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp) + self.assertTrue(ujson.decode( + ujson.encode(nested, orient="values")) == exp) exp = {'df1': ujson.decode(ujson.encode(df, orient="split")), 'df2': ujson.decode(ujson.encode(df, orient="split"))} - self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp) + self.assertTrue(ujson.decode( + ujson.encode(nested, orient="split")) == exp) def testDataFrameNumpyLabelled(self): - df = DataFrame([[1,2,3], [4,5,6]], index=['a', 'b'], columns=['x', 'y', 'z']) + df = DataFrame([[1, 2, 3], [4, 5, 6]], index=[ + 'a', 'b'], columns=['x', 'y', 'z']) # column indexed - outp = DataFrame(*ujson.decode(ujson.encode(df), numpy=True, labelled=True)) + outp = DataFrame(*ujson.decode(ujson.encode(df), + numpy=True, labelled=True)) self.assertTrue((df.T == outp).values.all()) tm.assert_numpy_array_equal(df.T.columns, outp.columns) tm.assert_numpy_array_equal(df.T.index, outp.index) - outp = DataFrame(*ujson.decode(ujson.encode(df, orient="records"), numpy=True, labelled=True)) + outp = DataFrame(*ujson.decode(ujson.encode(df, orient="records"), + numpy=True, labelled=True)) outp.index = df.index self.assertTrue((df == outp).values.all()) tm.assert_numpy_array_equal(df.columns, outp.columns) - outp = DataFrame(*ujson.decode(ujson.encode(df, orient="index"), numpy=True, labelled=True)) + outp = DataFrame(*ujson.decode(ujson.encode(df, orient="index"), + numpy=True, labelled=True)) self.assertTrue((df == outp).values.all()) 
tm.assert_numpy_array_equal(df.columns, outp.columns) tm.assert_numpy_array_equal(df.index, outp.index) def testSeries(self): - s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6,7,8,9,10,15]).sort_values() + s = Series([10, 20, 30, 40, 50, 60], name="series", + index=[6, 7, 8, 9, 10, 15]).sort_values() # column indexed outp = Series(ujson.decode(ujson.encode(s))).sort_values() @@ -1265,31 +1305,36 @@ def testSeries(self): self.assertTrue(s.name == outp.name) dec = _clean_dict(ujson.decode(ujson.encode(s, orient="split"), - numpy=True)) + numpy=True)) outp = Series(**dec) self.assertTrue((s == outp).values.all()) self.assertTrue(s.name == outp.name) - outp = Series(ujson.decode(ujson.encode(s, orient="records"), numpy=True)) + outp = Series(ujson.decode(ujson.encode( + s, orient="records"), numpy=True)) self.assertTrue((s == outp).values.all()) outp = Series(ujson.decode(ujson.encode(s, orient="records"))) self.assertTrue((s == outp).values.all()) - outp = Series(ujson.decode(ujson.encode(s, orient="values"), numpy=True)) + outp = Series(ujson.decode( + ujson.encode(s, orient="values"), numpy=True)) self.assertTrue((s == outp).values.all()) outp = Series(ujson.decode(ujson.encode(s, orient="values"))) self.assertTrue((s == outp).values.all()) - outp = Series(ujson.decode(ujson.encode(s, orient="index"))).sort_values() + outp = Series(ujson.decode(ujson.encode( + s, orient="index"))).sort_values() self.assertTrue((s == outp).values.all()) - outp = Series(ujson.decode(ujson.encode(s, orient="index"), numpy=True)).sort_values() + outp = Series(ujson.decode(ujson.encode( + s, orient="index"), numpy=True)).sort_values() self.assertTrue((s == outp).values.all()) def testSeriesNested(self): - s = Series([10, 20, 30, 40, 50, 60], name="series", index=[6,7,8,9,10,15]).sort_values() + s = Series([10, 20, 30, 40, 50, 60], name="series", + index=[6, 7, 8, 9, 10, 15]).sort_values() nested = {'s1': s, 's2': s.copy()} @@ -1299,19 +1344,23 @@ def testSeriesNested(self): exp = {'s1': ujson.decode(ujson.encode(s, orient="split")), 's2': ujson.decode(ujson.encode(s, orient="split"))} - self.assertTrue(ujson.decode(ujson.encode(nested, orient="split")) == exp) + self.assertTrue(ujson.decode( + ujson.encode(nested, orient="split")) == exp) exp = {'s1': ujson.decode(ujson.encode(s, orient="records")), 's2': ujson.decode(ujson.encode(s, orient="records"))} - self.assertTrue(ujson.decode(ujson.encode(nested, orient="records")) == exp) + self.assertTrue(ujson.decode( + ujson.encode(nested, orient="records")) == exp) exp = {'s1': ujson.decode(ujson.encode(s, orient="values")), 's2': ujson.decode(ujson.encode(s, orient="values"))} - self.assertTrue(ujson.decode(ujson.encode(nested, orient="values")) == exp) + self.assertTrue(ujson.decode( + ujson.encode(nested, orient="values")) == exp) exp = {'s1': ujson.decode(ujson.encode(s, orient="index")), 's2': ujson.decode(ujson.encode(s, orient="index"))} - self.assertTrue(ujson.decode(ujson.encode(nested, orient="index")) == exp) + self.assertTrue(ujson.decode( + ujson.encode(nested, orient="index")) == exp) def testIndex(self): i = Index([23, 45, 18, 98, 43, 11], name="index") @@ -1329,7 +1378,7 @@ def testIndex(self): self.assertTrue(i.name == outp.name) dec = _clean_dict(ujson.decode(ujson.encode(i, orient="split"), - numpy=True)) + numpy=True)) outp = Index(**dec) self.assertTrue(i.equals(outp)) self.assertTrue(i.name == outp.name) @@ -1337,13 +1386,15 @@ def testIndex(self): outp = Index(ujson.decode(ujson.encode(i, orient="values"))) 
self.assertTrue(i.equals(outp)) - outp = Index(ujson.decode(ujson.encode(i, orient="values"), numpy=True)) + outp = Index(ujson.decode(ujson.encode( + i, orient="values"), numpy=True)) self.assertTrue(i.equals(outp)) outp = Index(ujson.decode(ujson.encode(i, orient="records"))) self.assertTrue(i.equals(outp)) - outp = Index(ujson.decode(ujson.encode(i, orient="records"), numpy=True)) + outp = Index(ujson.decode(ujson.encode( + i, orient="records"), numpy=True)) self.assertTrue(i.equals(outp)) outp = Index(ujson.decode(ujson.encode(i, orient="index"))) @@ -1424,7 +1475,7 @@ def test_decodeTooBigValue(self): try: input = "9223372036854775808" ujson.decode(input) - except ValueError as e: + except ValueError: pass else: assert False, "expected ValueError" @@ -1433,7 +1484,7 @@ def test_decodeTooSmallValue(self): try: input = "-90223372036854775809" ujson.decode(input) - except ValueError as e: + except ValueError: pass else: assert False, "expected ValueError" @@ -1488,21 +1539,32 @@ def test_decodeArrayFaultyUnicode(self): def test_decodeFloatingPointAdditionalTests(self): places = 15 - self.assertAlmostEqual(-1.1234567893, ujson.loads("-1.1234567893"), places=places) - self.assertAlmostEqual(-1.234567893, ujson.loads("-1.234567893"), places=places) - self.assertAlmostEqual(-1.34567893, ujson.loads("-1.34567893"), places=places) - self.assertAlmostEqual(-1.4567893, ujson.loads("-1.4567893"), places=places) - self.assertAlmostEqual(-1.567893, ujson.loads("-1.567893"), places=places) - self.assertAlmostEqual(-1.67893, ujson.loads("-1.67893"), places=places) + self.assertAlmostEqual(-1.1234567893, + ujson.loads("-1.1234567893"), places=places) + self.assertAlmostEqual(-1.234567893, + ujson.loads("-1.234567893"), places=places) + self.assertAlmostEqual(-1.34567893, + ujson.loads("-1.34567893"), places=places) + self.assertAlmostEqual(-1.4567893, + ujson.loads("-1.4567893"), places=places) + self.assertAlmostEqual(-1.567893, + ujson.loads("-1.567893"), places=places) + self.assertAlmostEqual(-1.67893, + ujson.loads("-1.67893"), places=places) self.assertAlmostEqual(-1.7893, ujson.loads("-1.7893"), places=places) self.assertAlmostEqual(-1.893, ujson.loads("-1.893"), places=places) self.assertAlmostEqual(-1.3, ujson.loads("-1.3"), places=places) - self.assertAlmostEqual(1.1234567893, ujson.loads("1.1234567893"), places=places) - self.assertAlmostEqual(1.234567893, ujson.loads("1.234567893"), places=places) - self.assertAlmostEqual(1.34567893, ujson.loads("1.34567893"), places=places) - self.assertAlmostEqual(1.4567893, ujson.loads("1.4567893"), places=places) - self.assertAlmostEqual(1.567893, ujson.loads("1.567893"), places=places) + self.assertAlmostEqual(1.1234567893, ujson.loads( + "1.1234567893"), places=places) + self.assertAlmostEqual(1.234567893, ujson.loads( + "1.234567893"), places=places) + self.assertAlmostEqual( + 1.34567893, ujson.loads("1.34567893"), places=places) + self.assertAlmostEqual( + 1.4567893, ujson.loads("1.4567893"), places=places) + self.assertAlmostEqual( + 1.567893, ujson.loads("1.567893"), places=places) self.assertAlmostEqual(1.67893, ujson.loads("1.67893"), places=places) self.assertAlmostEqual(1.7893, ujson.loads("1.7893"), places=places) self.assertAlmostEqual(1.893, ujson.loads("1.893"), places=places) @@ -1519,7 +1581,7 @@ def test_encodeEmptySet(self): self.assertEqual("[]", ujson.encode(s)) def test_encodeSet(self): - s = set([1,2,3,4,5,6,7,8,9]) + s = set([1, 2, 3, 4, 5, 6, 7, 8, 9]) enc = ujson.encode(s) dec = ujson.decode(enc) @@ -1532,5 +1594,5 @@ def 
_clean_dict(d): if __name__ == '__main__': - nose.runmodule(argv=[__file__,'-vvs','-x','--pdb', '--pdb-failure'], + nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], exit=False) diff --git a/pandas/io/tests/test_json_norm.py b/pandas/io/tests/test_json_norm.py index 8084446d2d246..81a1fecbdebac 100644 --- a/pandas/io/tests/test_json_norm.py +++ b/pandas/io/tests/test_json_norm.py @@ -7,6 +7,7 @@ from pandas.io.json import json_normalize, nested_to_record + def _assert_equal_data(left, right): if not left.columns.equals(right.columns): left = left.reindex(columns=right.columns) @@ -18,17 +19,17 @@ class TestJSONNormalize(tm.TestCase): def setUp(self): self.state_data = [ - {'counties': [{'name': 'Dade', 'population': 12345}, - {'name': 'Broward', 'population': 40000}, - {'name': 'Palm Beach', 'population': 60000}], - 'info': {'governor': 'Rick Scott'}, - 'shortname': 'FL', - 'state': 'Florida'}, - {'counties': [{'name': 'Summit', 'population': 1234}, - {'name': 'Cuyahoga', 'population': 1337}], - 'info': {'governor': 'John Kasich'}, - 'shortname': 'OH', - 'state': 'Ohio'}] + {'counties': [{'name': 'Dade', 'population': 12345}, + {'name': 'Broward', 'population': 40000}, + {'name': 'Palm Beach', 'population': 60000}], + 'info': {'governor': 'Rick Scott'}, + 'shortname': 'FL', + 'state': 'Florida'}, + {'counties': [{'name': 'Summit', 'population': 1234}, + {'name': 'Cuyahoga', 'population': 1337}], + 'info': {'governor': 'John Kasich'}, + 'shortname': 'OH', + 'state': 'Ohio'}] def test_simple_records(self): recs = [{'a': 1, 'b': 2, 'c': 3}, @@ -67,28 +68,28 @@ def test_more_deeply_nested(self): 'pop': 12345}, {'name': 'Los Angeles', 'pop': 12346}] - }, + }, {'name': 'Ohio', 'cities': [{'name': 'Columbus', 'pop': 1234}, {'name': 'Cleveland', 'pop': 1236}]} - ] + ] }, {'country': 'Germany', 'states': [{'name': 'Bayern', 'cities': [{'name': 'Munich', 'pop': 12347}] - }, + }, {'name': 'Nordrhein-Westfalen', 'cities': [{'name': 'Duesseldorf', 'pop': 1238}, {'name': 'Koeln', 'pop': 1239}]} - ] + ] } ] result = json_normalize(data, ['states', 'cities'], meta=['country', ['states', 'name']]) - # meta_prefix={'states': 'state_'}) + # meta_prefix={'states': 'state_'}) ex_data = {'country': ['USA'] * 4 + ['Germany'] * 3, 'states.name': ['California', 'California', 'Ohio', 'Ohio', @@ -105,15 +106,15 @@ def test_shallow_nested(self): data = [{'state': 'Florida', 'shortname': 'FL', 'info': { - 'governor': 'Rick Scott' + 'governor': 'Rick Scott' }, 'counties': [{'name': 'Dade', 'population': 12345}, - {'name': 'Broward', 'population': 40000}, - {'name': 'Palm Beach', 'population': 60000}]}, + {'name': 'Broward', 'population': 40000}, + {'name': 'Palm Beach', 'population': 60000}]}, {'state': 'Ohio', 'shortname': 'OH', 'info': { - 'governor': 'John Kasich' + 'governor': 'John Kasich' }, 'counties': [{'name': 'Summit', 'population': 1234}, {'name': 'Cuyahoga', 'population': 1337}]}] @@ -167,8 +168,8 @@ def test_record_prefix(self): class TestNestedToRecord(tm.TestCase): def test_flat_stays_flat(self): - recs = [dict(flat1=1,flat2=2), - dict(flat1=3,flat2=4), + recs = [dict(flat1=1, flat2=2), + dict(flat1=3, flat2=4), ] result = nested_to_record(recs) @@ -177,30 +178,30 @@ def test_flat_stays_flat(self): def test_one_level_deep_flattens(self): data = dict(flat1=1, - dict1=dict(c=1,d=2)) + dict1=dict(c=1, d=2)) result = nested_to_record(data) - expected = {'dict1.c': 1, - 'dict1.d': 2, - 'flat1': 1} + expected = {'dict1.c': 1, + 'dict1.d': 2, + 'flat1': 1} - 
self.assertEqual(result,expected) + self.assertEqual(result, expected) def test_nested_flattens(self): data = dict(flat1=1, - dict1=dict(c=1,d=2), - nested=dict(e=dict(c=1,d=2), + dict1=dict(c=1, d=2), + nested=dict(e=dict(c=1, d=2), d=2)) result = nested_to_record(data) - expected = {'dict1.c': 1, - 'dict1.d': 2, - 'flat1': 1, - 'nested.d': 2, - 'nested.e.c': 1, - 'nested.e.d': 2} - - self.assertEqual(result,expected) + expected = {'dict1.c': 1, + 'dict1.d': 2, + 'flat1': 1, + 'nested.d': 2, + 'nested.e.c': 1, + 'nested.e.d': 2} + + self.assertEqual(result, expected) if __name__ == '__main__': nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', diff --git a/pandas/io/tests/test_packers.py b/pandas/io/tests/test_packers.py index bdbcb9c0d0d3e..6905225600ae6 100644 --- a/pandas/io/tests/test_packers.py +++ b/pandas/io/tests/test_packers.py @@ -18,7 +18,6 @@ from pandas.tests.test_panel import assert_panel_equal import pandas -from pandas.sparse.tests.test_sparse import assert_sp_series_equal, assert_sp_frame_equal from pandas import Timestamp, tslib nan = np.nan @@ -57,6 +56,7 @@ def check_arbitrary(a, b): else: assert(a == b) + class TestPackers(tm.TestCase): def setUp(self): @@ -70,31 +70,32 @@ def encode_decode(self, x, compress=None, **kwargs): to_msgpack(p, x, compress=compress, **kwargs) return read_msgpack(p, **kwargs) + class TestAPI(TestPackers): def test_string_io(self): - df = DataFrame(np.random.randn(10,2)) + df = DataFrame(np.random.randn(10, 2)) s = df.to_msgpack(None) result = read_msgpack(s) - tm.assert_frame_equal(result,df) + tm.assert_frame_equal(result, df) s = df.to_msgpack() result = read_msgpack(s) - tm.assert_frame_equal(result,df) + tm.assert_frame_equal(result, df) s = df.to_msgpack() result = read_msgpack(compat.BytesIO(s)) - tm.assert_frame_equal(result,df) + tm.assert_frame_equal(result, df) - s = to_msgpack(None,df) + s = to_msgpack(None, df) result = read_msgpack(s) tm.assert_frame_equal(result, df) with ensure_clean(self.path) as p: s = df.to_msgpack() - fh = open(p,'wb') + fh = open(p, 'wb') fh.write(s) fh.close() result = read_msgpack(p) @@ -102,14 +103,15 @@ def test_string_io(self): def test_iterator_with_string_io(self): - dfs = [ DataFrame(np.random.randn(10,2)) for i in range(5) ] - s = to_msgpack(None,*dfs) - for i, result in enumerate(read_msgpack(s,iterator=True)): - tm.assert_frame_equal(result,dfs[i]) + dfs = [DataFrame(np.random.randn(10, 2)) for i in range(5)] + s = to_msgpack(None, *dfs) + for i, result in enumerate(read_msgpack(s, iterator=True)): + tm.assert_frame_equal(result, dfs[i]) def test_invalid_arg(self): - #GH10369 + # GH10369 class A(object): + def __init__(self): self.read = 0 @@ -123,7 +125,7 @@ class TestNumpy(TestPackers): def test_numpy_scalar_float(self): x = np.float32(np.random.rand()) x_rec = self.encode_decode(x) - tm.assert_almost_equal(x,x_rec) + tm.assert_almost_equal(x, x_rec) def test_numpy_scalar_complex(self): x = np.complex64(np.random.rand() + 1j * np.random.rand()) @@ -133,7 +135,7 @@ def test_numpy_scalar_complex(self): def test_scalar_float(self): x = np.random.rand() x_rec = self.encode_decode(x) - tm.assert_almost_equal(x,x_rec) + tm.assert_almost_equal(x, x_rec) def test_scalar_complex(self): x = np.random.rand() + 1j * np.random.rand() @@ -143,7 +145,7 @@ def test_scalar_complex(self): def test_list_numpy_float(self): x = [np.float32(np.random.rand()) for i in range(5)] x_rec = self.encode_decode(x) - tm.assert_almost_equal(x,x_rec) + tm.assert_almost_equal(x, x_rec) def 
test_list_numpy_float_complex(self): if not hasattr(np, 'complex128'): @@ -158,7 +160,7 @@ def test_list_numpy_float_complex(self): def test_list_float(self): x = [np.random.rand() for i in range(5)] x_rec = self.encode_decode(x) - tm.assert_almost_equal(x,x_rec) + tm.assert_almost_equal(x, x_rec) def test_list_float_complex(self): x = [np.random.rand() for i in range(5)] + \ @@ -169,7 +171,7 @@ def test_list_float_complex(self): def test_dict_float(self): x = {'foo': 1.0, 'bar': 2.0} x_rec = self.encode_decode(x) - tm.assert_almost_equal(x,x_rec) + tm.assert_almost_equal(x, x_rec) def test_dict_complex(self): x = {'foo': 1.0 + 1.0j, 'bar': 2.0 + 2.0j} @@ -181,7 +183,7 @@ def test_dict_complex(self): def test_dict_numpy_float(self): x = {'foo': np.float32(1.0), 'bar': np.float32(2.0)} x_rec = self.encode_decode(x) - tm.assert_almost_equal(x,x_rec) + tm.assert_almost_equal(x, x_rec) def test_dict_numpy_complex(self): x = {'foo': np.complex128(1.0 + 1.0j), @@ -196,10 +198,10 @@ def test_numpy_array_float(self): # run multiple times for n in range(10): x = np.random.rand(10) - for dtype in ['float32','float64']: + for dtype in ['float32', 'float64']: x = x.astype(dtype) x_rec = self.encode_decode(x) - tm.assert_almost_equal(x,x_rec) + tm.assert_almost_equal(x, x_rec) def test_numpy_array_complex(self): x = (np.random.rand(5) + 1j * np.random.rand(5)).astype(np.complex128) @@ -210,7 +212,8 @@ def test_numpy_array_complex(self): def test_list_mixed(self): x = [1.0, np.float32(3.5), np.complex128(4.25), u('foo')] x_rec = self.encode_decode(x) - tm.assert_almost_equal(x,x_rec) + tm.assert_almost_equal(x, x_rec) + class TestBasic(TestPackers): @@ -229,9 +232,10 @@ def test_datetimes(self): if LooseVersion(sys.version) < '2.7': raise nose.SkipTest('2.6 with np.datetime64 is broken') - for i in [datetime.datetime( - 2013, 1, 1), datetime.datetime(2013, 1, 1, 5, 1), - datetime.date(2013, 1, 1), np.datetime64(datetime.datetime(2013, 1, 5, 2, 15))]: + for i in [datetime.datetime(2013, 1, 1), + datetime.datetime(2013, 1, 1, 5, 1), + datetime.date(2013, 1, 1), + np.datetime64(datetime.datetime(2013, 1, 5, 2, 15))]: i_rec = self.encode_decode(i) self.assertEqual(i, i_rec) @@ -263,8 +267,10 @@ def setUp(self): } self.mi = { - 'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'), ('foo', 'two'), - ('qux', 'one'), ('qux', 'two')], names=['first', 'second']), + 'reg': MultiIndex.from_tuples([('bar', 'one'), ('baz', 'two'), + ('foo', 'two'), + ('qux', 'one'), ('qux', 'two')], + names=['first', 'second']), } def test_basic_index(self): @@ -274,12 +280,13 @@ def test_basic_index(self): self.assertTrue(i.equals(i_rec)) # datetime with no freq (GH5506) - i = Index([Timestamp('20130101'),Timestamp('20130103')]) + i = Index([Timestamp('20130101'), Timestamp('20130103')]) i_rec = self.encode_decode(i) self.assertTrue(i.equals(i_rec)) # datetime with timezone - i = Index([Timestamp('20130101 9:00:00'),Timestamp('20130103 11:00:00')]).tz_localize('US/Eastern') + i = Index([Timestamp('20130101 9:00:00'), Timestamp( + '20130103 11:00:00')]).tz_localize('US/Eastern') i_rec = self.encode_decode(i) self.assertTrue(i.equals(i_rec)) @@ -295,8 +302,8 @@ def test_unicode(self): # this currently fails self.assertRaises(UnicodeEncodeError, self.encode_decode, i) - #i_rec = self.encode_decode(i) - #self.assertTrue(i.equals(i_rec)) + # i_rec = self.encode_decode(i) + # self.assertTrue(i.equals(i_rec)) class TestSeries(TestPackers): @@ -354,10 +361,12 @@ def setUp(self): self.frame = { 'float': DataFrame(dict(A=data['A'], 
B=Series(data['A']) + 1)), 'int': DataFrame(dict(A=data['B'], B=Series(data['B']) + 1)), - 'mixed': DataFrame(dict([(k, data[k]) for k in ['A', 'B', 'C', 'D']]))} + 'mixed': DataFrame(dict([(k, data[k]) + for k in ['A', 'B', 'C', 'D']]))} self.panel = { - 'float': Panel(dict(ItemA=self.frame['float'], ItemB=self.frame['float'] + 1))} + 'float': Panel(dict(ItemA=self.frame['float'], + ItemB=self.frame['float'] + 1))} def test_basic_frame(self): @@ -377,8 +386,8 @@ def test_multi(self): for k in self.frame.keys(): assert_frame_equal(self.frame[k], i_rec[k]) - l = tuple( - [self.frame['float'], self.frame['float'].A, self.frame['float'].B, None]) + l = tuple([self.frame['float'], self.frame['float'].A, + self.frame['float'].B, None]) l_rec = self.encode_decode(l) check_arbitrary(l, l_rec) @@ -415,7 +424,7 @@ def test_dataframe_duplicate_column_names(self): # GH 9618 expected_1 = DataFrame(columns=['a', 'a']) - expected_2 = DataFrame(columns=[1]*100) + expected_2 = DataFrame(columns=[1] * 100) expected_2.loc[0] = np.random.randn(100) expected_3 = DataFrame(columns=[1, 1]) expected_3.loc[0] = ['abc', np.nan] @@ -434,8 +443,8 @@ class TestSparse(TestPackers): def _check_roundtrip(self, obj, comparator, **kwargs): # currently these are not implemetned - #i_rec = self.encode_decode(obj) - #comparator(obj, i_rec, **kwargs) + # i_rec = self.encode_decode(obj) + # comparator(obj, i_rec, **kwargs) self.assertRaises(NotImplementedError, self.encode_decode, obj) def test_sparse_series(self): @@ -581,29 +590,30 @@ def test_readonly_axis_zlib_to_sql(self): class TestEncoding(TestPackers): - def setUp(self): - super(TestEncoding, self).setUp() - data = { - 'A': [compat.u('\u2019')] * 1000, - 'B': np.arange(1000, dtype=np.int32), - 'C': list(100 * 'abcdefghij'), - 'D': date_range(datetime.datetime(2015, 4, 1), periods=1000), - 'E': [datetime.timedelta(days=x) for x in range(1000)], - 'G': [400] * 1000 - } - self.frame = { - 'float': DataFrame(dict((k, data[k]) for k in ['A', 'A'])), - 'int': DataFrame(dict((k, data[k]) for k in ['B', 'B'])), - 'mixed': DataFrame(data), - } - self.utf_encodings = ['utf8', 'utf16', 'utf32'] - - def test_utf(self): - # GH10581 - for encoding in self.utf_encodings: - for frame in compat.itervalues(self.frame): - result = self.encode_decode(frame, encoding=encoding) - assert_frame_equal(result, frame) + + def setUp(self): + super(TestEncoding, self).setUp() + data = { + 'A': [compat.u('\u2019')] * 1000, + 'B': np.arange(1000, dtype=np.int32), + 'C': list(100 * 'abcdefghij'), + 'D': date_range(datetime.datetime(2015, 4, 1), periods=1000), + 'E': [datetime.timedelta(days=x) for x in range(1000)], + 'G': [400] * 1000 + } + self.frame = { + 'float': DataFrame(dict((k, data[k]) for k in ['A', 'A'])), + 'int': DataFrame(dict((k, data[k]) for k in ['B', 'B'])), + 'mixed': DataFrame(data), + } + self.utf_encodings = ['utf8', 'utf16', 'utf32'] + + def test_utf(self): + # GH10581 + for encoding in self.utf_encodings: + for frame in compat.itervalues(self.frame): + result = self.encode_decode(frame, encoding=encoding) + assert_frame_equal(result, frame) class TestMsgpack(): @@ -620,13 +630,15 @@ class TestMsgpack(): NOTE: TestMsgpack can't be a subclass of tm.Testcase to use test generator. 
http://stackoverflow.com/questions/6689537/nose-test-generators-inside-class """ + def setUp(self): from pandas.io.tests.generate_legacy_storage_files import ( create_msgpack_data, create_data) self.data = create_msgpack_data() self.all_data = create_data() self.path = u('__%s__.msgpack' % tm.rands(10)) - self.minimum_structure = {'series': ['float', 'int', 'mixed', 'ts', 'mi', 'dup'], + self.minimum_structure = {'series': ['float', 'int', 'mixed', + 'ts', 'mi', 'dup'], 'frame': ['float', 'int', 'mixed', 'mi'], 'panel': ['float'], 'index': ['int', 'date', 'period'], @@ -636,15 +648,19 @@ def check_min_structure(self, data): for typ, v in self.minimum_structure.items(): assert typ in data, '"{0}" not found in unpacked data'.format(typ) for kind in v: - assert kind in data[typ], '"{0}" not found in data["{1}"]'.format(kind, typ) + assert kind in data[ + typ], '"{0}" not found in data["{1}"]'.format(kind, typ) def compare(self, vf, version): data = read_msgpack(vf) self.check_min_structure(data) for typ, dv in data.items(): - assert typ in self.all_data, 'unpacked data contains extra key "{0}"'.format(typ) + assert typ in self.all_data, ('unpacked data contains ' + 'extra key "{0}"' + .format(typ)) for dt, result in dv.items(): - assert dt in self.all_data[typ], 'data["{0}"] contains extra key "{1}"'.format(typ, dt) + assert dt in self.all_data[typ], ('data["{0}"] contains extra ' + 'key "{1}"'.format(typ, dt)) try: expected = self.data[typ][dt] except KeyError: @@ -652,7 +668,8 @@ def compare(self, vf, version): # use a specific comparator # if available - comparator = getattr(self,"compare_{typ}_{dt}".format(typ=typ,dt=dt), None) + comparator = getattr( + self, "compare_{typ}_{dt}".format(typ=typ, dt=dt), None) if comparator is not None: comparator(result, expected, typ, version) else: @@ -697,9 +714,3 @@ def test_msgpack(self): yield self.read_msgpacks, v n += 1 assert n > 0, 'Msgpack files are not tested' - - -if __name__ == '__main__': - import nose - nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], - exit=False) diff --git a/pandas/io/tests/test_parsers.py b/pandas/io/tests/test_parsers.py index e34f2cb87a2df..06afd20071349 100755 --- a/pandas/io/tests/test_parsers.py +++ b/pandas/io/tests/test_parsers.py @@ -1,6 +1,8 @@ # -*- coding: utf-8 -*- # pylint: disable=E1101 +# flake8: noqa + from datetime import datetime import csv import os @@ -71,13 +73,6 @@ def test_converters_type_must_be_dict(self): with tm.assertRaisesRegexp(TypeError, 'Type converters.+'): self.read_csv(StringIO(self.data1), converters=0) - def test_multi_character_decimal_marker(self): - data = """A|B|C -1|2,334|5 -10|13|10. 
-""" - self.assertRaises(ValueError, read_csv, StringIO(data), decimal=',,') - def test_empty_decimal_marker(self): data = """A|B|C 1|2,334|5 @@ -92,7 +87,6 @@ def test_empty_thousands_marker(self): """ self.assertRaises(ValueError, read_csv, StringIO(data), thousands='') - def test_multi_character_decimal_marker(self): data = """A|B|C 1|2,334|5 @@ -141,8 +135,8 @@ def test_empty_string(self): np.nan, 'seven']}) tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - - # GH4318, passing na_values=None and keep_default_na=False yields 'None' as a na_value + # GH4318, passing na_values=None and keep_default_na=False yields + # 'None' as a na_value data = """\ One,Two,Three a,1,None @@ -161,7 +155,6 @@ def test_empty_string(self): 'seven']}) tm.assert_frame_equal(xp.reindex(columns=df.columns), df) - def test_read_csv(self): if not compat.PY3: if compat.is_platform_windows(): @@ -170,7 +163,7 @@ def test_read_csv(self): prefix = u("file://") fname = prefix + compat.text_type(self.csv1) # it works! - df1 = read_csv(fname, index_col=0, parse_dates=True) + read_csv(fname, index_col=0, parse_dates=True) def test_dialect(self): data = """\ @@ -202,7 +195,7 @@ def test_dialect_str(self): 'fruit': ['apple', 'pear'], 'vegetable': ['brocolli', 'tomato'] }) - dia = csv.register_dialect('mydialect', delimiter=':') + dia = csv.register_dialect('mydialect', delimiter=':') # noqa df = self.read_csv(StringIO(data), dialect='mydialect') tm.assert_frame_equal(df, exp) csv.unregister_dialect('mydialect') @@ -242,17 +235,20 @@ def test_1000_sep_with_decimal(self): df = self.read_csv(StringIO(data), sep='|', thousands=',', decimal='.') tm.assert_frame_equal(df, expected) - df = self.read_table(StringIO(data), sep='|', thousands=',', decimal='.') + df = self.read_table(StringIO(data), sep='|', + thousands=',', decimal='.') tm.assert_frame_equal(df, expected) data_with_odd_sep = """A|B|C 1|2.334,01|5 10|13|10, """ - df = self.read_csv(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',') + df = self.read_csv(StringIO(data_with_odd_sep), + sep='|', thousands='.', decimal=',') tm.assert_frame_equal(df, expected) - df = self.read_table(StringIO(data_with_odd_sep), sep='|', thousands='.', decimal=',') + df = self.read_table(StringIO(data_with_odd_sep), + sep='|', thousands='.', decimal=',') tm.assert_frame_equal(df, expected) def test_separator_date_conflict(self): @@ -264,7 +260,8 @@ def test_separator_date_conflict(self): columns=['Date', 2] ) - df = self.read_csv(StringIO(data), sep=';', thousands='-', parse_dates={'Date': [0, 1]}, header=None) + df = self.read_csv(StringIO(data), sep=';', thousands='-', + parse_dates={'Date': [0, 1]}, header=None) tm.assert_frame_equal(df, expected) def test_squeeze(self): @@ -410,7 +407,7 @@ def test_multiple_date_col_timestamp_parse(self): data = """05/31/2012,15:30:00.029,1306.25,1,E,0,,1306.25 05/31/2012,15:30:00.029,1306.25,8,E,0,,1306.25""" result = self.read_csv(StringIO(data), sep=',', header=None, - parse_dates=[[0,1]], date_parser=Timestamp) + parse_dates=[[0, 1]], date_parser=Timestamp) ex_val = Timestamp('05/31/2012 15:30:00.029') self.assertEqual(result['0_1'][0], ex_val) @@ -471,7 +468,7 @@ def test_multiple_date_col_name_collision(self): KORD3,19990127, 21:00:00, 20:56:00, -0.5900, 2.2100, 5.7000, 0.0000, 280.0000 KORD4,19990127, 21:00:00, 21:18:00, -0.9900, 2.0100, 3.6000, 0.0000, 270.0000 KORD5,19990127, 22:00:00, 21:56:00, -0.5900, 1.7100, 5.1000, 0.0000, 290.0000 -KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" 
+KORD6,19990127, 23:00:00, 22:56:00, -0.5900, 1.7100, 4.6000, 0.0000, 280.0000""" # noqa self.assertRaises(ValueError, self.read_csv, StringIO(data), parse_dates=[[1, 2]]) @@ -515,11 +512,12 @@ def test_usecols_index_col_False(self): # Issue 9082 s = "a,b,c,d\n1,2,3,4\n5,6,7,8" s_malformed = "a,b,c,d\n1,2,3,4,\n5,6,7,8," - cols = ['a','c','d'] - expected = DataFrame({'a':[1,5], 'c':[3,7], 'd':[4,8]}) + cols = ['a', 'c', 'd'] + expected = DataFrame({'a': [1, 5], 'c': [3, 7], 'd': [4, 8]}) df = self.read_csv(StringIO(s), usecols=cols, index_col=False) tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(s_malformed), usecols=cols, index_col=False) + df = self.read_csv(StringIO(s_malformed), + usecols=cols, index_col=False) tm.assert_frame_equal(expected, df) def test_index_col_is_True(self): @@ -542,9 +540,9 @@ def test_date_parser_int_bug(self): # #3071 log_file = StringIO( 'posix_timestamp,elapsed,sys,user,queries,query_time,rows,' - 'accountid,userid,contactid,level,silo,method\n' + 'accountid,userid,contactid,level,silo,method\n' '1343103150,0.062353,0,4,6,0.01690,3,' - '12345,1,-1,3,invoice_InvoiceResource,search\n' + '12345,1,-1,3,invoice_InvoiceResource,search\n' ) def f(posix_string): @@ -588,7 +586,7 @@ def test_malformed(self): # Test for ValueError with other engines: try: - with tm.assertRaisesRegexp(ValueError, 'skip_footer'): #XXX + with tm.assertRaisesRegexp(ValueError, 'skip_footer'): # XXX df = self.read_table( StringIO(data), sep=',', header=1, comment='#', skip_footer=1) @@ -607,7 +605,8 @@ def test_malformed(self): """ try: it = self.read_table(StringIO(data), sep=',', - header=1, comment='#', iterator=True, chunksize=1, + header=1, comment='#', + iterator=True, chunksize=1, skiprows=[2]) df = it.read(5) self.assertTrue(False) @@ -644,8 +643,8 @@ def test_malformed(self): """ try: it = self.read_table(StringIO(data), sep=',', - header=1, comment='#', iterator=True, chunksize=1, - skiprows=[2]) + header=1, comment='#', + iterator=True, chunksize=1, skiprows=[2]) df = it.read(1) it.read() self.assertTrue(False) @@ -659,9 +658,10 @@ def test_passing_dtype(self): # Test for ValueError with other engines: with tm.assertRaisesRegexp(ValueError, - "The 'dtype' option is not supported"): + "The 'dtype' option is not supported"): - df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E']) + df = DataFrame(np.random.rand(5, 2), columns=list( + 'AB'), index=['1A', '1B', '1C', '1D', '1E']) with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: df.to_csv(path) @@ -669,24 +669,30 @@ def test_passing_dtype(self): # GH 3795 # passing 'str' as the dtype result = self.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes,Series({ 'A' : 'object', 'B' : 'object' })) + tm.assert_series_equal(result.dtypes, Series( + {'A': 'object', 'B': 'object'})) - # we expect all object columns, so need to convert to test for equivalence + # we expect all object columns, so need to convert to test for + # equivalence result = result.astype(float) - tm.assert_frame_equal(result,df) + tm.assert_frame_equal(result, df) # invalid dtype - self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'foo', 'B' : 'float64' }, + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'foo', 'B': 'float64'}, index_col=0) # valid but we don't support it (date) - self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 
'datetime64', 'B': 'float64'}, index_col=0) - self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'datetime64', 'B': 'float64'}, index_col=0, parse_dates=['B']) # valid but we don't support it - self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' }, + self.assertRaises(TypeError, self.read_csv, path, + dtype={'A': 'timedelta64', 'B': 'float64'}, index_col=0) def test_quoting(self): @@ -706,65 +712,72 @@ def test_quoting(self): def test_non_string_na_values(self): # GH3611, na_values that are not a string are an issue with tm.ensure_clean('__non_string_na_values__.csv') as path: - df = DataFrame({'A' : [-999, 2, 3], 'B' : [1.2, -999, 4.5]}) + df = DataFrame({'A': [-999, 2, 3], 'B': [1.2, -999, 4.5]}) df.to_csv(path, sep=' ', index=False) - result1 = read_csv(path, sep= ' ', header=0, na_values=['-999.0','-999']) - result2 = read_csv(path, sep= ' ', header=0, na_values=[-999,-999.0]) - result3 = read_csv(path, sep= ' ', header=0, na_values=[-999.0,-999]) - tm.assert_frame_equal(result1,result2) - tm.assert_frame_equal(result2,result3) - - result4 = read_csv(path, sep= ' ', header=0, na_values=['-999.0']) - result5 = read_csv(path, sep= ' ', header=0, na_values=['-999']) - result6 = read_csv(path, sep= ' ', header=0, na_values=[-999.0]) - result7 = read_csv(path, sep= ' ', header=0, na_values=[-999]) - tm.assert_frame_equal(result4,result3) - tm.assert_frame_equal(result5,result3) - tm.assert_frame_equal(result6,result3) - tm.assert_frame_equal(result7,result3) + result1 = read_csv(path, sep=' ', header=0, + na_values=['-999.0', '-999']) + result2 = read_csv(path, sep=' ', header=0, + na_values=[-999, -999.0]) + result3 = read_csv(path, sep=' ', header=0, + na_values=[-999.0, -999]) + tm.assert_frame_equal(result1, result2) + tm.assert_frame_equal(result2, result3) + + result4 = read_csv(path, sep=' ', header=0, na_values=['-999.0']) + result5 = read_csv(path, sep=' ', header=0, na_values=['-999']) + result6 = read_csv(path, sep=' ', header=0, na_values=[-999.0]) + result7 = read_csv(path, sep=' ', header=0, na_values=[-999]) + tm.assert_frame_equal(result4, result3) + tm.assert_frame_equal(result5, result3) + tm.assert_frame_equal(result6, result3) + tm.assert_frame_equal(result7, result3) good_compare = result3 - # with an odd float format, so we can't match the string 999.0 exactly, - # but need float matching - df.to_csv(path, sep=' ', index=False, float_format = '%.3f') - result1 = read_csv(path, sep= ' ', header=0, na_values=['-999.0','-999']) - result2 = read_csv(path, sep= ' ', header=0, na_values=[-999,-999.0]) - result3 = read_csv(path, sep= ' ', header=0, na_values=[-999.0,-999]) - tm.assert_frame_equal(result1,good_compare) - tm.assert_frame_equal(result2,good_compare) - tm.assert_frame_equal(result3,good_compare) - - result4 = read_csv(path, sep= ' ', header=0, na_values=['-999.0']) - result5 = read_csv(path, sep= ' ', header=0, na_values=['-999']) - result6 = read_csv(path, sep= ' ', header=0, na_values=[-999.0]) - result7 = read_csv(path, sep= ' ', header=0, na_values=[-999]) - tm.assert_frame_equal(result4,good_compare) - tm.assert_frame_equal(result5,good_compare) - tm.assert_frame_equal(result6,good_compare) - tm.assert_frame_equal(result7,good_compare) + # with an odd float format, so we can't match the string 999.0 + # exactly, but need float matching + df.to_csv(path, sep=' ', index=False, float_format='%.3f') + result1 = 
read_csv(path, sep=' ', header=0, + na_values=['-999.0', '-999']) + result2 = read_csv(path, sep=' ', header=0, + na_values=[-999, -999.0]) + result3 = read_csv(path, sep=' ', header=0, + na_values=[-999.0, -999]) + tm.assert_frame_equal(result1, good_compare) + tm.assert_frame_equal(result2, good_compare) + tm.assert_frame_equal(result3, good_compare) + + result4 = read_csv(path, sep=' ', header=0, na_values=['-999.0']) + result5 = read_csv(path, sep=' ', header=0, na_values=['-999']) + result6 = read_csv(path, sep=' ', header=0, na_values=[-999.0]) + result7 = read_csv(path, sep=' ', header=0, na_values=[-999]) + tm.assert_frame_equal(result4, good_compare) + tm.assert_frame_equal(result5, good_compare) + tm.assert_frame_equal(result6, good_compare) + tm.assert_frame_equal(result7, good_compare) def test_default_na_values(self): _NA_VALUES = set(['-1.#IND', '1.#QNAN', '1.#IND', '-1.#QNAN', - '#N/A','N/A', 'NA', '#NA', 'NULL', 'NaN', - 'nan', '-NaN', '-nan', '#N/A N/A','']) + '#N/A', 'N/A', 'NA', '#NA', 'NULL', 'NaN', + 'nan', '-NaN', '-nan', '#N/A N/A', '']) self.assertEqual(_NA_VALUES, parsers._NA_VALUES) nv = len(_NA_VALUES) + def f(i, v): if i == 0: buf = '' elif i > 0: buf = ''.join([','] * i) - buf = "{0}{1}".format(buf,v) + buf = "{0}{1}".format(buf, v) - if i < nv-1: - buf = "{0}{1}".format(buf,''.join([','] * (nv-i-1))) + if i < nv - 1: + buf = "{0}{1}".format(buf, ''.join([','] * (nv - i - 1))) return buf - data = StringIO('\n'.join([ f(i, v) for i, v in enumerate(_NA_VALUES) ])) - expected = DataFrame(np.nan,columns=range(nv),index=range(nv)) + data = StringIO('\n'.join([f(i, v) for i, v in enumerate(_NA_VALUES)])) + expected = DataFrame(np.nan, columns=range(nv), index=range(nv)) df = self.read_csv(data, header=None) tm.assert_frame_equal(df, expected) @@ -794,23 +807,24 @@ def test_nat_parse(self): # GH 3062 df = DataFrame(dict({ - 'A' : np.asarray(lrange(10),dtype='float64'), - 'B' : pd.Timestamp('20010101') })) - df.iloc[3:6,:] = np.nan + 'A': np.asarray(lrange(10), dtype='float64'), + 'B': pd.Timestamp('20010101')})) + df.iloc[3:6, :] = np.nan with tm.ensure_clean('__nat_parse_.csv') as path: df.to_csv(path) - result = read_csv(path,index_col=0,parse_dates=['B']) - tm.assert_frame_equal(result,df) + result = read_csv(path, index_col=0, parse_dates=['B']) + tm.assert_frame_equal(result, df) - expected = Series(dict( A = 'float64',B = 'datetime64[ns]')) - tm.assert_series_equal(expected,result.dtypes) + expected = Series(dict(A='float64', B='datetime64[ns]')) + tm.assert_series_equal(expected, result.dtypes) # test with NaT for the nan_rep - # we don't have a method to specif the Datetime na_rep (it defaults to '') + # we don't have a method to specif the Datetime na_rep (it defaults + # to '') df.to_csv(path) - result = read_csv(path,index_col=0,parse_dates=['B']) - tm.assert_frame_equal(result,df) + result = read_csv(path, index_col=0, parse_dates=['B']) + tm.assert_frame_equal(result, df) def test_skiprows_bug(self): # GH #505 @@ -840,8 +854,12 @@ def test_skiprows_bug(self): def test_deep_skiprows(self): # GH #4382 - text = "a,b,c\n" + "\n".join([",".join([str(i), str(i+1), str(i+2)]) for i in range(10)]) - condensed_text = "a,b,c\n" + "\n".join([",".join([str(i), str(i+1), str(i+2)]) for i in [0, 1, 2, 3, 4, 6, 8, 9]]) + text = "a,b,c\n" + \ + "\n".join([",".join([str(i), str(i + 1), str(i + 2)]) + for i in range(10)]) + condensed_text = "a,b,c\n" + \ + "\n".join([",".join([str(i), str(i + 1), str(i + 2)]) + for i in [0, 1, 2, 3, 4, 6, 8, 9]]) data = 
self.read_csv(StringIO(text), skiprows=[6, 8]) condensed_data = self.read_csv(StringIO(condensed_text)) tm.assert_frame_equal(data, condensed_data) @@ -859,7 +877,7 @@ def test_skiprows_blank(self): 1/3/2000,7,8,9 """ data = self.read_csv(StringIO(text), skiprows=6, header=None, - index_col=0, parse_dates=True) + index_col=0, parse_dates=True) expected = DataFrame(np.arange(1., 10.).reshape((3, 3)), columns=[1, 2, 3], @@ -918,13 +936,15 @@ def test_duplicate_columns(self): 11,12,13,14,15 """ # check default beahviour - df = self.read_table(StringIO(data), sep=',',engine=engine) + df = self.read_table(StringIO(data), sep=',', engine=engine) self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2']) - df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=False) + df = self.read_table(StringIO(data), sep=',', + engine=engine, mangle_dupe_cols=False) self.assertEqual(list(df.columns), ['A', 'A', 'B', 'B', 'B']) - df = self.read_table(StringIO(data), sep=',',engine=engine,mangle_dupe_cols=True) + df = self.read_table(StringIO(data), sep=',', + engine=engine, mangle_dupe_cols=True) self.assertEqual(list(df.columns), ['A', 'A.1', 'B', 'B.1', 'B.2']) def test_csv_mixed_type(self): @@ -955,7 +975,8 @@ def test_parse_dates_implicit_first_col(self): """ df = self.read_csv(StringIO(data), parse_dates=True) expected = self.read_csv(StringIO(data), index_col=0, parse_dates=True) - self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) + self.assertIsInstance( + df.index[0], (datetime, np.datetime64, Timestamp)) tm.assert_frame_equal(df, expected) def test_parse_dates_string(self): @@ -1087,7 +1108,8 @@ def test_read_csv_dataframe(self): parse_dates=True) self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D']) self.assertEqual(df.index.name, 'index') - self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) + self.assertIsInstance( + df.index[0], (datetime, np.datetime64, Timestamp)) self.assertEqual(df.values.dtype, np.float64) tm.assert_frame_equal(df, df2) @@ -1096,8 +1118,10 @@ def test_read_csv_no_index_name(self): df2 = self.read_table(self.csv2, sep=',', index_col=0, parse_dates=True) self.assert_numpy_array_equal(df.columns, ['A', 'B', 'C', 'D', 'E']) - self.assertIsInstance(df.index[0], (datetime, np.datetime64, Timestamp)) - self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D']].values.dtype, np.float64) + self.assertIsInstance( + df.index[0], (datetime, np.datetime64, Timestamp)) + self.assertEqual(df.ix[:, ['A', 'B', 'C', 'D'] + ].values.dtype, np.float64) tm.assert_frame_equal(df, df2) def test_read_csv_infer_compression(self): @@ -1109,7 +1133,7 @@ def test_read_csv_infer_compression(self): for f in inputs: df = self.read_csv(f, index_col=0, parse_dates=True, - compression='infer') + compression='infer') tm.assert_frame_equal(expected, df) @@ -1311,13 +1335,15 @@ def test_iterator(self): """ reader = self.read_csv(StringIO(data), iterator=True) result = list(reader) - expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) + expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ + 3, 6, 9]), index=['foo', 'bar', 'baz']) tm.assert_frame_equal(result[0], expected) # chunksize = 1 reader = self.read_csv(StringIO(data), chunksize=1) result = list(reader) - expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) + expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ + 3, 6, 9]), index=['foo', 'bar', 'baz']) self.assertEqual(len(result), 3) 
tm.assert_frame_equal(pd.concat(result), expected) @@ -1340,7 +1366,8 @@ def test_header_not_first_line(self): tm.assert_frame_equal(df, expected) def test_header_multi_index(self): - expected = tm.makeCustomDataframe(5,3,r_idx_nlevels=2,c_idx_nlevels=4) + expected = tm.makeCustomDataframe( + 5, 3, r_idx_nlevels=2, c_idx_nlevels=4) data = """\ C0,,C_l0_g0,C_l0_g1,C_l0_g2 @@ -1356,35 +1383,37 @@ def test_header_multi_index(self): R_l0_g4,R_l1_g4,R4C0,R4C1,R4C2 """ - df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], tupleize_cols=False) + df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[ + 0, 1], tupleize_cols=False) tm.assert_frame_equal(df, expected) # skipping lines in the header - df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[0, 1], tupleize_cols=False) + df = self.read_csv(StringIO(data), header=[0, 1, 2, 3], index_col=[ + 0, 1], tupleize_cols=False) tm.assert_frame_equal(df, expected) #### invalid options #### # no as_recarray - self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3], - index_col=[0,1], as_recarray=True, tupleize_cols=False) + self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], + index_col=[0, 1], as_recarray=True, tupleize_cols=False) # names - self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3], - index_col=[0,1], names=['foo','bar'], tupleize_cols=False) + self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], + index_col=[0, 1], names=['foo', 'bar'], tupleize_cols=False) # usecols - self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3], - index_col=[0,1], usecols=['foo','bar'], tupleize_cols=False) + self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], + index_col=[0, 1], usecols=['foo', 'bar'], tupleize_cols=False) # non-numeric index_col - self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0,1,2,3], - index_col=['foo','bar'], tupleize_cols=False) + self.assertRaises(ValueError, self.read_csv, StringIO(data), header=[0, 1, 2, 3], + index_col=['foo', 'bar'], tupleize_cols=False) def test_header_multiindex_common_format(self): - df = DataFrame([[1,2,3,4,5,6],[7,8,9,10,11,12]], - index=['one','two'], - columns=MultiIndex.from_tuples([('a','q'),('a','r'),('a','s'), - ('b','t'),('c','u'),('c','v')])) + df = DataFrame([[1, 2, 3, 4, 5, 6], [7, 8, 9, 10, 11, 12]], + index=['one', 'two'], + columns=MultiIndex.from_tuples([('a', 'q'), ('a', 'r'), ('a', 's'), + ('b', 't'), ('c', 'u'), ('c', 'v')])) # to_csv data = """,a,a,a,b,c,c @@ -1393,8 +1422,8 @@ def test_header_multiindex_common_format(self): one,1,2,3,4,5,6 two,7,8,9,10,11,12""" - result = self.read_csv(StringIO(data),header=[0,1],index_col=0) - tm.assert_frame_equal(df,result) + result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(df, result) # common data = """,a,a,a,b,c,c @@ -1402,8 +1431,8 @@ def test_header_multiindex_common_format(self): one,1,2,3,4,5,6 two,7,8,9,10,11,12""" - result = self.read_csv(StringIO(data),header=[0,1],index_col=0) - tm.assert_frame_equal(df,result) + result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(df, result) # common, no index_col data = """a,a,a,b,c,c @@ -1411,15 +1440,16 @@ def test_header_multiindex_common_format(self): 1,2,3,4,5,6 7,8,9,10,11,12""" - result = self.read_csv(StringIO(data),header=[0,1],index_col=None) - 
tm.assert_frame_equal(df.reset_index(drop=True),result) + result = self.read_csv(StringIO(data), header=[0, 1], index_col=None) + tm.assert_frame_equal(df.reset_index(drop=True), result) # malformed case 1 expected = DataFrame(np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'), index=Index([1, 7]), columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]], - labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + labels=[[0, 0, 1, 2, 2], [ + 0, 1, 2, 3, 4]], names=[u('a'), u('q')])) data = """a,a,a,b,c,c @@ -1427,15 +1457,16 @@ def test_header_multiindex_common_format(self): 1,2,3,4,5,6 7,8,9,10,11,12""" - result = self.read_csv(StringIO(data),header=[0,1],index_col=0) - tm.assert_frame_equal(expected,result) + result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(expected, result) # malformed case 2 expected = DataFrame(np.array([[2, 3, 4, 5, 6], [8, 9, 10, 11, 12]], dtype='int64'), index=Index([1, 7]), columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('r'), u('s'), u('t'), u('u'), u('v')]], - labels=[[0, 0, 1, 2, 2], [0, 1, 2, 3, 4]], + labels=[[0, 0, 1, 2, 2], [ + 0, 1, 2, 3, 4]], names=[None, u('q')])) data = """,a,a,b,c,c @@ -1443,16 +1474,17 @@ def test_header_multiindex_common_format(self): 1,2,3,4,5,6 7,8,9,10,11,12""" - result = self.read_csv(StringIO(data),header=[0,1],index_col=0) - tm.assert_frame_equal(expected,result) + result = self.read_csv(StringIO(data), header=[0, 1], index_col=0) + tm.assert_frame_equal(expected, result) # mi on columns and index (malformed) - expected = DataFrame(np.array([[ 3, 4, 5, 6], - [ 9, 10, 11, 12]], dtype='int64'), + expected = DataFrame(np.array([[3, 4, 5, 6], + [9, 10, 11, 12]], dtype='int64'), index=MultiIndex(levels=[[1, 7], [2, 8]], labels=[[0, 1], [0, 1]]), columns=MultiIndex(levels=[[u('a'), u('b'), u('c')], [u('s'), u('t'), u('u'), u('v')]], - labels=[[0, 1, 2, 2], [0, 1, 2, 3]], + labels=[[0, 1, 2, 2], + [0, 1, 2, 3]], names=[None, u('q')])) data = """,a,a,b,c,c @@ -1460,8 +1492,8 @@ def test_header_multiindex_common_format(self): 1,2,3,4,5,6 7,8,9,10,11,12""" - result = self.read_csv(StringIO(data),header=[0,1],index_col=[0, 1]) - tm.assert_frame_equal(expected,result) + result = self.read_csv(StringIO(data), header=[0, 1], index_col=[0, 1]) + tm.assert_frame_equal(expected, result) def test_pass_names_with_index(self): lines = self.data1.split('\n') @@ -1795,7 +1827,7 @@ def test_na_value_dict(self): def test_url(self): # HTTP(S) url = ('https://raw.github.com/pydata/pandas/master/' - 'pandas/io/tests/data/salary.table') + 'pandas/io/tests/data/salary.table') url_table = self.read_table(url) dirpath = tm.get_data_path() localtable = os.path.join(dirpath, 'salary.table') @@ -1865,7 +1897,7 @@ def test_multiple_date_cols_chunked(self): df = self.read_csv(StringIO(self.ts_data), parse_dates={ 'nominal': [1, 2]}, index_col='nominal') reader = self.read_csv(StringIO(self.ts_data), parse_dates={'nominal': - [1, 2]}, index_col='nominal', chunksize=2) + [1, 2]}, index_col='nominal', chunksize=2) chunks = list(reader) @@ -2151,24 +2183,31 @@ def test_usecols_index_col_conflict(self): 10000,2013-5-11,100,10,1 500,2013-5-12,101,11,1 """ - expected = DataFrame({'Price': [100, 101]}, index=[datetime(2013, 5, 11), datetime(2013, 5, 12)]) + expected = DataFrame({'Price': [100, 101]}, index=[ + datetime(2013, 5, 11), datetime(2013, 5, 12)]) expected.index.name = 'Time' - df = self.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col=0) + df = 
self.read_csv(StringIO(data), usecols=[ + 'Time', 'Price'], parse_dates=True, index_col=0) tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(data), usecols=['Time', 'Price'], parse_dates=True, index_col='Time') + df = self.read_csv(StringIO(data), usecols=[ + 'Time', 'Price'], parse_dates=True, index_col='Time') tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col='Time') + df = self.read_csv(StringIO(data), usecols=[ + 1, 2], parse_dates=True, index_col='Time') tm.assert_frame_equal(expected, df) - df = self.read_csv(StringIO(data), usecols=[1, 2], parse_dates=True, index_col=0) + df = self.read_csv(StringIO(data), usecols=[ + 1, 2], parse_dates=True, index_col=0) tm.assert_frame_equal(expected, df) - expected = DataFrame({'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)}) + expected = DataFrame( + {'P3': [1, 1], 'Price': (100, 101), 'P2': (10, 11)}) expected = expected.set_index(['Price', 'P2']) - df = self.read_csv(StringIO(data), usecols=['Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2']) + df = self.read_csv(StringIO(data), usecols=[ + 'Price', 'P2', 'P3'], parse_dates=True, index_col=['Price', 'P2']) tm.assert_frame_equal(expected, df) def test_chunks_have_consistent_numerical_type(self): @@ -2177,7 +2216,8 @@ def test_chunks_have_consistent_numerical_type(self): with tm.assert_produces_warning(False): df = self.read_csv(StringIO(data)) - self.assertTrue(type(df.a[0]) is np.float64) # Assert that types were coerced. + # Assert that types were coerced. + self.assertTrue(type(df.a[0]) is np.float64) self.assertEqual(df.a.dtype, np.float) def test_warn_if_chunks_have_mismatched_type(self): @@ -2230,7 +2270,6 @@ def test_usecols(self): header=None, usecols=['b', 'c']) tm.assert_frame_equal(result2, result) - # 5766 result = self.read_csv(StringIO(data), names=['a', 'b'], header=None, usecols=[0, 1]) @@ -2261,19 +2300,20 @@ def test_catch_too_many_names(self): 4,,6 7,8,9 10,11,12\n""" - tm.assertRaises(Exception, read_csv, StringIO(data), header=0, names=['a', 'b', 'c', 'd']) + tm.assertRaises(Exception, read_csv, StringIO(data), + header=0, names=['a', 'b', 'c', 'd']) def test_ignore_leading_whitespace(self): # GH 6607, GH 3374 data = ' a b c\n 1 2 3\n 4 5 6\n 7 8 9' result = self.read_table(StringIO(data), sep='\s+') - expected = DataFrame({'a':[1,4,7], 'b':[2,5,8], 'c': [3,6,9]}) + expected = DataFrame({'a': [1, 4, 7], 'b': [2, 5, 8], 'c': [3, 6, 9]}) tm.assert_frame_equal(result, expected) def test_nrows_and_chunksize_raises_notimplemented(self): data = 'a b c' self.assertRaises(NotImplementedError, self.read_csv, StringIO(data), - nrows=10, chunksize=5) + nrows=10, chunksize=5) def test_single_char_leading_whitespace(self): # GH 9710 @@ -2284,7 +2324,7 @@ def test_single_char_leading_whitespace(self): a b\n""" - expected = DataFrame({'MyColumn' : list('abab')}) + expected = DataFrame({'MyColumn': list('abab')}) result = self.read_csv(StringIO(data), skipinitialspace=True) tm.assert_frame_equal(result, expected) @@ -2329,27 +2369,37 @@ def test_empty_index_col_scenarios(self): # None, no index index_col, expected = None, DataFrame([], columns=list('xyz')), - tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), expected) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) # False, no index index_col, expected = False, DataFrame([], columns=list('xyz')), - tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), 
expected) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) # int, first column - index_col, expected = 0, DataFrame([], columns=['y', 'z'], index=Index([], name='x')) - tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), expected) + index_col, expected = 0, DataFrame( + [], columns=['y', 'z'], index=Index([], name='x')) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) # int, not first column - index_col, expected = 1, DataFrame([], columns=['x', 'z'], index=Index([], name='y')) - tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), expected) + index_col, expected = 1, DataFrame( + [], columns=['x', 'z'], index=Index([], name='y')) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) # str, first column - index_col, expected = 'x', DataFrame([], columns=['y', 'z'], index=Index([], name='x')) - tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), expected) + index_col, expected = 'x', DataFrame( + [], columns=['y', 'z'], index=Index([], name='x')) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) # str, not the first column - index_col, expected = 'y', DataFrame([], columns=['x', 'z'], index=Index([], name='y')) - tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), expected) + index_col, expected = 'y', DataFrame( + [], columns=['x', 'z'], index=Index([], name='y')) + tm.assert_frame_equal(self.read_csv( + StringIO(data), index_col=index_col), expected) # list of int index_col, expected = [0, 1], DataFrame([], columns=['z'], @@ -2359,19 +2409,22 @@ def test_empty_index_col_scenarios(self): # list of str index_col = ['x', 'y'] - expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays([[]] * 2, names=['x', 'y'])) + expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( + [[]] * 2, names=['x', 'y'])) tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), expected, check_index_type=False) # list of int, reversed sequence index_col = [1, 0] - expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays([[]] * 2, names=['y', 'x'])) + expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( + [[]] * 2, names=['y', 'x'])) tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), expected, check_index_type=False) # list of str, reversed sequence index_col = ['y', 'x'] - expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays([[]] * 2, names=['y', 'x'])) + expected = DataFrame([], columns=['z'], index=MultiIndex.from_arrays( + [[]] * 2, names=['y', 'x'])) tm.assert_frame_equal(self.read_csv(StringIO(data), index_col=index_col), expected, check_index_type=False) @@ -2403,8 +2456,8 @@ def test_int64_overflow(self): self.assertTrue(result['ID'].dtype == object) self.assertRaises((OverflowError, pandas.parser.OverflowError), - self.read_csv, StringIO(data), - converters={'ID' : np.int64}) + self.read_csv, StringIO(data), + converters={'ID': np.int64}) # Just inside int64 range: parse as integer i_max = np.iinfo(np.int64).max @@ -2434,16 +2487,19 @@ def test_empty_with_nrows_chunksize(self): result = pd.read_csv(StringIO('foo,bar\n'), nrows=10, as_recarray=True) result = pd.DataFrame(result[2], columns=result[1], index=result[0]) - tm.assert_frame_equal(pd.DataFrame.from_records(result), expected, check_index_type=False) + tm.assert_frame_equal(pd.DataFrame.from_records( + 
result), expected, check_index_type=False) - result = next(iter(pd.read_csv(StringIO('foo,bar\n'), chunksize=10, as_recarray=True))) + result = next( + iter(pd.read_csv(StringIO('foo,bar\n'), chunksize=10, as_recarray=True))) result = pd.DataFrame(result[2], columns=result[1], index=result[0]) - tm.assert_frame_equal(pd.DataFrame.from_records(result), expected, check_index_type=False) + tm.assert_frame_equal(pd.DataFrame.from_records( + result), expected, check_index_type=False) def test_eof_states(self): # GH 10728 and 10548 - ## With skip_blank_lines = True + # With skip_blank_lines = True expected = pd.DataFrame([[4, 5, 6]], columns=['a', 'b', 'c']) # GH 10728 @@ -2473,43 +2529,49 @@ def test_eof_states(self): result = self.read_csv(StringIO(data), skiprows=[2]) tm.assert_frame_equal(result, expected) - ## With skip_blank_lines = False + # With skip_blank_lines = False # EAT_LINE_COMMENT data = 'a,b,c\n4,5,6\n#comment' - result = self.read_csv(StringIO(data), comment='#', skip_blank_lines=False) + result = self.read_csv( + StringIO(data), comment='#', skip_blank_lines=False) expected = pd.DataFrame([[4, 5, 6]], columns=['a', 'b', 'c']) tm.assert_frame_equal(result, expected) # IN_FIELD data = 'a,b,c\n4,5,6\n ' result = self.read_csv(StringIO(data), skip_blank_lines=False) - expected = pd.DataFrame([['4', 5, 6], [' ', None, None]], columns=['a', 'b', 'c']) + expected = pd.DataFrame( + [['4', 5, 6], [' ', None, None]], columns=['a', 'b', 'c']) tm.assert_frame_equal(result, expected) # EAT_CRNL data = 'a,b,c\n4,5,6\n\r' result = self.read_csv(StringIO(data), skip_blank_lines=False) - expected = pd.DataFrame([[4, 5, 6], [None, None, None]], columns=['a', 'b', 'c']) + expected = pd.DataFrame( + [[4, 5, 6], [None, None, None]], columns=['a', 'b', 'c']) tm.assert_frame_equal(result, expected) - ## Should produce exceptions + # Should produce exceptions # ESCAPED_CHAR data = "a,b,c\n4,5,6\n\\" - self.assertRaises(Exception, self.read_csv, StringIO(data), escapechar='\\') + self.assertRaises(Exception, self.read_csv, + StringIO(data), escapechar='\\') # ESCAPE_IN_QUOTED_FIELD data = 'a,b,c\n4,5,6\n"\\' - self.assertRaises(Exception, self.read_csv, StringIO(data), escapechar='\\') + self.assertRaises(Exception, self.read_csv, + StringIO(data), escapechar='\\') # IN_QUOTED_FIELD data = 'a,b,c\n4,5,6\n"' - self.assertRaises(Exception, self.read_csv, StringIO(data), escapechar='\\') - + self.assertRaises(Exception, self.read_csv, + StringIO(data), escapechar='\\') class TestPythonParser(ParserTests, tm.TestCase): + def test_negative_skipfooter_raises(self): text = """#foo,a,b,c #foo,a,b,c @@ -2718,14 +2780,13 @@ def test_fwf_colspecs_None(self): expected = DataFrame([[123456, 456], [456789, 789]]) tm.assert_frame_equal(result, expected) - def test_fwf_regression(self): # GH 3594 - #### turns out 'T060' is parsable as a datetime slice! + # turns out 'T060' is parsable as a datetime slice! 
- tzlist = [1,10,20,30,60,80,100] + tzlist = [1, 10, 20, 30, 60, 80, 100] ntz = len(tzlist) - tcolspecs = [16]+[8]*ntz + tcolspecs = [16] + [8] * ntz tcolnames = ['SST'] + ["T%03d" % z for z in tzlist[1:]] data = """ 2009164202000 9.5403 9.4105 8.6571 7.8372 6.0612 5.8843 5.5192 2009164203000 9.5435 9.2010 8.6167 7.8176 6.0804 5.8728 5.4869 @@ -2740,27 +2801,28 @@ def test_fwf_regression(self): names=tcolnames, widths=tcolspecs, parse_dates=True, - date_parser=lambda s: datetime.strptime(s,'%Y%j%H%M%S')) + date_parser=lambda s: datetime.strptime(s, '%Y%j%H%M%S')) for c in df.columns: - res = df.loc[:,c] + res = df.loc[:, c] self.assertTrue(len(res)) def test_fwf_for_uint8(self): data = """1421302965.213420 PRI=3 PGN=0xef00 DST=0x17 SRC=0x28 04 154 00 00 00 00 00 127 1421302964.226776 PRI=6 PGN=0xf002 SRC=0x47 243 00 00 255 247 00 00 71""" df = read_fwf(StringIO(data), - colspecs=[(0,17),(25,26),(33,37),(49,51),(58,62),(63,1000)], - names=['time','pri','pgn','dst','src','data'], - converters={ - 'pgn':lambda x: int(x,16), - 'src':lambda x: int(x,16), - 'dst':lambda x: int(x,16), - 'data':lambda x: len(x.split(' '))}) - - expected = DataFrame([[1421302965.213420,3,61184,23,40,8], - [1421302964.226776,6,61442,None, 71,8]], - columns = ["time", "pri", "pgn", "dst", "src","data"]) + colspecs=[(0, 17), (25, 26), (33, 37), + (49, 51), (58, 62), (63, 1000)], + names=['time', 'pri', 'pgn', 'dst', 'src', 'data'], + converters={ + 'pgn': lambda x: int(x, 16), + 'src': lambda x: int(x, 16), + 'dst': lambda x: int(x, 16), + 'data': lambda x: len(x.split(' '))}) + + expected = DataFrame([[1421302965.213420, 3, 61184, 23, 40, 8], + [1421302964.226776, 6, 61442, None, 71, 8]], + columns=["time", "pri", "pgn", "dst", "src", "data"]) expected["dst"] = expected["dst"].astype(object) tm.assert_frame_equal(df, expected) @@ -2792,14 +2854,16 @@ def test_fwf_compression(self): def test_BytesIO_input(self): if not compat.PY3: - raise nose.SkipTest("Bytes-related test - only needs to work on Python 3") - result = pd.read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[2,2], encoding='utf8') + raise nose.SkipTest( + "Bytes-related test - only needs to work on Python 3") + result = pd.read_fwf(BytesIO("שלום\nשלום".encode('utf8')), widths=[ + 2, 2], encoding='utf8') expected = pd.DataFrame([["של", "ום"]], columns=["של", "ום"]) tm.assert_frame_equal(result, expected) data = BytesIO("שלום::1234\n562::123".encode('cp1255')) result = pd.read_table(data, sep="::", engine='python', encoding='cp1255') - expected = pd.DataFrame([[562, 123]], columns=["שלום","1234"]) + expected = pd.DataFrame([[562, 123]], columns=["שלום", "1234"]) tm.assert_frame_equal(result, expected) def test_verbose_import(self): @@ -2819,7 +2883,8 @@ def test_verbose_import(self): try: # it works! df = self.read_csv(StringIO(text), verbose=True) - self.assertEqual(buf.getvalue(), 'Filled 3 NA values in column a\n') + self.assertEqual( + buf.getvalue(), 'Filled 3 NA values in column a\n') finally: sys.stdout = sys.__stdout__ @@ -2839,19 +2904,22 @@ def test_verbose_import(self): try: # it works! 
df = self.read_csv(StringIO(text), verbose=True, index_col=0) - self.assertEqual(buf.getvalue(), 'Filled 1 NA values in column a\n') + self.assertEqual( + buf.getvalue(), 'Filled 1 NA values in column a\n') finally: sys.stdout = sys.__stdout__ def test_float_precision_specified(self): - # Should raise an error if float_precision (C parser option) is specified + # Should raise an error if float_precision (C parser option) is + # specified with tm.assertRaisesRegexp(ValueError, "The 'float_precision' option " "is not supported with the 'python' engine"): self.read_csv(StringIO('a,b,c\n1,2,3'), float_precision='high') def test_iteration_open_handle(self): if PY3: - raise nose.SkipTest("won't work in Python 3 {0}".format(sys.version_info)) + raise nose.SkipTest( + "won't work in Python 3 {0}".format(sys.version_info)) with tm.ensure_clean() as path: with open(path, 'wb') as f: @@ -2923,13 +2991,15 @@ def test_iterator(self): """ reader = self.read_csv(StringIO(data), iterator=True) result = list(reader) - expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) + expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ + 3, 6, 9]), index=['foo', 'bar', 'baz']) tm.assert_frame_equal(result[0], expected) # chunksize = 1 reader = self.read_csv(StringIO(data), chunksize=1) result = list(reader) - expected = DataFrame(dict(A = [1,4,7], B = [2,5,8], C = [3,6,9]), index=['foo','bar','baz']) + expected = DataFrame(dict(A=[1, 4, 7], B=[2, 5, 8], C=[ + 3, 6, 9]), index=['foo', 'bar', 'baz']) self.assertEqual(len(result), 3) tm.assert_frame_equal(pd.concat(result), expected) @@ -3124,8 +3194,8 @@ def test_read_table_buglet_4x_multiindex(self): # GH 6893 data = ' A B C\na b c\n1 3 7 0 3 6\n3 1 4 1 5 9' - expected = DataFrame.from_records([(1,3,7,0,3,6), (3,1,4,1,5,9)], - columns=list('abcABC'), index=list('abc')) + expected = DataFrame.from_records([(1, 3, 7, 0, 3, 6), (3, 1, 4, 1, 5, 9)], + columns=list('abcABC'), index=list('abc')) actual = self.read_table(StringIO(data), sep='\s+') tm.assert_frame_equal(actual, expected) @@ -3177,12 +3247,13 @@ def test_whitespace_lines(self): 5.,NaN,10.0 """ expected = [[1, 2., 4.], - [5., np.nan, 10.]] + [5., np.nan, 10.]] df = self.read_csv(StringIO(data)) tm.assert_almost_equal(df.values, expected) class TestFwfColspaceSniffing(tm.TestCase): + def test_full_file(self): # File with all values test = '''index A B C @@ -3270,7 +3341,8 @@ def test_multiple_delimiters(self): def test_variable_width_unicode(self): if not compat.PY3: - raise nose.SkipTest('Bytes-related test - only needs to work on Python 3') + raise nose.SkipTest( + 'Bytes-related test - only needs to work on Python 3') test = ''' שלום שלום ום שלל @@ -3298,7 +3370,8 @@ def read_table(self, *args, **kwds): def test_compact_ints(self): if compat.is_platform_windows(): - raise nose.SkipTest("segfaults on win-64, only when all tests are run") + raise nose.SkipTest( + "segfaults on win-64, only when all tests are run") data = ('0,1,0,0\n' '1,1,0,0\n' @@ -3322,7 +3395,8 @@ def test_parse_dates_empty_string(self): self.assertTrue(result['Date'].isnull()[1]) def test_usecols(self): - raise nose.SkipTest("Usecols is not supported in C High Memory engine.") + raise nose.SkipTest( + "Usecols is not supported in C High Memory engine.") def test_line_comment(self): data = """# empty @@ -3337,11 +3411,11 @@ def test_line_comment(self): tm.assert_almost_equal(df.values, expected) # check with delim_whitespace=True df = self.read_csv(StringIO(data.replace(',', ' ')), comment='#', - 
delim_whitespace=True) + delim_whitespace=True) tm.assert_almost_equal(df.values, expected) # check with custom line terminator df = self.read_csv(StringIO(data.replace('\n', '*')), comment='#', - lineterminator='*') + lineterminator='*') tm.assert_almost_equal(df.values, expected) def test_comment_skiprows(self): @@ -3360,7 +3434,7 @@ def test_comment_skiprows(self): tm.assert_almost_equal(df.values, expected) def test_skiprows_lineterminator(self): - #GH #9079 + # GH #9079 data = '\n'.join(['SMOSMANIA ThetaProbe-ML2X ', '2007/01/01 01:00 0.2140 U M ', '2007/01/01 02:00 0.2141 M O ', @@ -3386,23 +3460,24 @@ def test_skiprows_lineterminator(self): def test_trailing_spaces(self): data = "A B C \nrandom line with trailing spaces \nskip\n1,2,3\n1,2.,4.\nrandom line with trailing tabs\t\t\t\n \n5.1,NaN,10.0\n" expected = pd.DataFrame([[1., 2., 4.], - [5.1, np.nan, 10.]]) + [5.1, np.nan, 10.]]) # this should ignore six lines including lines with trailing # whitespace and blank lines. issues 8661, 8679 df = self.read_csv(StringIO(data.replace(',', ' ')), header=None, delim_whitespace=True, - skiprows=[0,1,2,3,5,6], skip_blank_lines=True) + skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) tm.assert_frame_equal(df, expected) df = self.read_table(StringIO(data.replace(',', ' ')), header=None, delim_whitespace=True, - skiprows=[0,1,2,3,5,6], skip_blank_lines=True) + skiprows=[0, 1, 2, 3, 5, 6], skip_blank_lines=True) tm.assert_frame_equal(df, expected) - # test skipping set of rows after a row with trailing spaces, issue #8983 - expected = pd.DataFrame({"A":[1., 5.1], "B":[2., np.nan], - "C":[4., 10]}) + # test skipping set of rows after a row with trailing spaces, issue + # #8983 + expected = pd.DataFrame({"A": [1., 5.1], "B": [2., np.nan], + "C": [4., 10]}) df = self.read_table(StringIO(data.replace(',', ' ')), delim_whitespace=True, - skiprows=[1,2,3,5,6], skip_blank_lines=True) + skiprows=[1, 2, 3, 5, 6], skip_blank_lines=True) tm.assert_frame_equal(df, expected) def test_comment_header(self): @@ -3473,7 +3548,7 @@ def test_whitespace_lines(self): 5.,NaN,10.0 """ expected = [[1, 2., 4.], - [5., np.nan, 10.]] + [5., np.nan, 10.]] df = self.read_csv(StringIO(data)) tm.assert_almost_equal(df.values, expected) @@ -3482,7 +3557,8 @@ def test_passing_dtype(self): # This is a copy which should eventually be merged into ParserTests # when the dtype argument is supported by all engines. 
- df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E']) + df = DataFrame(np.random.rand(5, 2), columns=list( + 'AB'), index=['1A', '1B', '1C', '1D', '1E']) with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: df.to_csv(path) @@ -3490,24 +3566,26 @@ def test_passing_dtype(self): # GH 3795 # passing 'str' as the dtype result = self.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes,Series({ 'A' : 'object', 'B' : 'object' })) + tm.assert_series_equal(result.dtypes, Series( + {'A': 'object', 'B': 'object'})) - # we expect all object columns, so need to convert to test for equivalence + # we expect all object columns, so need to convert to test for + # equivalence result = result.astype(float) - tm.assert_frame_equal(result,df) + tm.assert_frame_equal(result, df) # invalid dtype - self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'foo', 'B' : 'float64' }, + self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'foo', 'B': 'float64'}, index_col=0) # valid but we don't support it (date) - self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'datetime64', 'B': 'float64'}, index_col=0) - self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'datetime64', 'B': 'float64'}, index_col=0, parse_dates=['B']) # valid but we don't support it - self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' }, + self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'timedelta64', 'B': 'float64'}, index_col=0) def test_dtype_and_names_error(self): @@ -3521,17 +3599,20 @@ def test_dtype_and_names_error(self): 3.0 3 """ # base cases - result = self.read_csv(StringIO(data),sep='\s+',header=None) - expected = DataFrame([[1.0,1],[2.0,2],[3.0,3]]) + result = self.read_csv(StringIO(data), sep='\s+', header=None) + expected = DataFrame([[1.0, 1], [2.0, 2], [3.0, 3]]) tm.assert_frame_equal(result, expected) - result = self.read_csv(StringIO(data),sep='\s+',header=None,names=['a','b']) - expected = DataFrame([[1.0,1],[2.0,2],[3.0,3]],columns=['a','b']) + result = self.read_csv(StringIO(data), sep='\s+', + header=None, names=['a', 'b']) + expected = DataFrame( + [[1.0, 1], [2.0, 2], [3.0, 3]], columns=['a', 'b']) tm.assert_frame_equal(result, expected) # fallback casting - result = self.read_csv(StringIO(data),sep='\s+',header=None,names=['a','b'],dtype={'a' : np.int32}) - expected = DataFrame([[1,1],[2,2],[3,3]],columns=['a','b']) + result = self.read_csv(StringIO( + data), sep='\s+', header=None, names=['a', 'b'], dtype={'a': np.int32}) + expected = DataFrame([[1, 1], [2, 2], [3, 3]], columns=['a', 'b']) expected['a'] = expected['a'].astype(np.int32) tm.assert_frame_equal(result, expected) @@ -3542,7 +3623,8 @@ def test_dtype_and_names_error(self): """ # fallback casting, but not castable with tm.assertRaisesRegexp(ValueError, 'cannot safely convert'): - self.read_csv(StringIO(data),sep='\s+',header=None,names=['a','b'],dtype={'a' : np.int32}) + self.read_csv(StringIO(data), sep='\s+', header=None, + names=['a', 'b'], dtype={'a': np.int32}) def test_fallback_to_python(self): # GH 6607 @@ -3551,13 +3633,12 @@ def test_fallback_to_python(self): # specify C engine with unsupported options (raise) with tm.assertRaisesRegexp(ValueError, 'does not support'): 
self.read_table(StringIO(data), engine='c', sep=None, - delim_whitespace=False) + delim_whitespace=False) with tm.assertRaisesRegexp(ValueError, 'does not support'): self.read_table(StringIO(data), engine='c', sep='\s') with tm.assertRaisesRegexp(ValueError, 'does not support'): self.read_table(StringIO(data), engine='c', skip_footer=1) - def test_buffer_overflow(self): # GH9205 # test certain malformed input files that cause buffer overflows in @@ -3569,7 +3650,8 @@ def test_buffer_overflow(self): try: df = self.read_table(StringIO(malf)) except Exception as cperr: - self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr)) + self.assertIn( + 'Buffer overflow caught - possible malformed input file.', str(cperr)) def test_single_char_leading_whitespace(self): # GH 9710 @@ -3580,7 +3662,7 @@ def test_single_char_leading_whitespace(self): a b\n""" - expected = DataFrame({'MyColumn' : list('abab')}) + expected = DataFrame({'MyColumn': list('abab')}) result = self.read_csv(StringIO(data), delim_whitespace=True, skipinitialspace=True) @@ -3590,6 +3672,7 @@ def test_single_char_leading_whitespace(self): skipinitialspace=True) tm.assert_frame_equal(result, expected) + class TestCParserLowMemory(ParserTests, tm.TestCase): def read_csv(self, *args, **kwds): @@ -3617,14 +3700,15 @@ def test_compact_ints(self): self.assertEqual(result.to_records(index=False).dtype, ex_dtype) result = read_csv(StringIO(data), delimiter=',', header=None, - compact_ints=True, + compact_ints=True, use_unsigned=True) ex_dtype = np.dtype([(str(i), 'u1') for i in range(4)]) self.assertEqual(result.to_records(index=False).dtype, ex_dtype) def test_compact_ints_as_recarray(self): if compat.is_platform_windows(): - raise nose.SkipTest("segfaults on win-64, only when all tests are run") + raise nose.SkipTest( + "segfaults on win-64, only when all tests are run") data = ('0,1,0,0\n' '1,1,0,0\n' @@ -3647,17 +3731,21 @@ def test_precise_conversion(self): from decimal import Decimal normal_errors = [] precise_errors = [] - for num in np.linspace(1., 2., num=500): # test numbers between 1 and 2 - text = 'a\n{0:.25}'.format(num) # 25 decimal digits of precision + for num in np.linspace(1., 2., num=500): # test numbers between 1 and 2 + text = 'a\n{0:.25}'.format(num) # 25 decimal digits of precision normal_val = float(self.read_csv(StringIO(text))['a'][0]) - precise_val = float(self.read_csv(StringIO(text), float_precision='high')['a'][0]) - roundtrip_val = float(self.read_csv(StringIO(text), float_precision='round_trip')['a'][0]) + precise_val = float(self.read_csv( + StringIO(text), float_precision='high')['a'][0]) + roundtrip_val = float(self.read_csv( + StringIO(text), float_precision='round_trip')['a'][0]) actual_val = Decimal(text[2:]) + def error(val): return abs(Decimal('{0:.100}'.format(val)) - actual_val) normal_errors.append(error(normal_val)) precise_errors.append(error(precise_val)) - self.assertEqual(roundtrip_val, float(text[2:])) # round-trip should match float() + # round-trip should match float() + self.assertEqual(roundtrip_val, float(text[2:])) self.assertTrue(sum(precise_errors) <= sum(normal_errors)) self.assertTrue(max(precise_errors) <= max(normal_errors)) @@ -3682,7 +3770,8 @@ def test_pass_dtype_as_recarray(self): 4,5.5""" if compat.is_platform_windows(): - raise nose.SkipTest("segfaults on win-64, only when all tests are run") + raise nose.SkipTest( + "segfaults on win-64, only when all tests are run") result = self.read_csv(StringIO(data), dtype={'one': 'u1', 1: 'S1'}, 
as_recarray=True) @@ -3713,35 +3802,42 @@ def test_empty_with_multiindex_pass_dtype(self): exp_idx = MultiIndex.from_arrays([np.empty(0, dtype='u1'), np.empty(0, dtype='O')], names=['one', 'two']) - expected = DataFrame({'three': np.empty(0, dtype=np.object)}, index=exp_idx) + expected = DataFrame( + {'three': np.empty(0, dtype=np.object)}, index=exp_idx) tm.assert_frame_equal(result, expected, check_index_type=False) def test_empty_with_mangled_column_pass_dtype_by_names(self): data = 'one,one' - result = self.read_csv(StringIO(data), dtype={'one': 'u1', 'one.1': 'f'}) + result = self.read_csv(StringIO(data), dtype={ + 'one': 'u1', 'one.1': 'f'}) - expected = DataFrame({'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) + expected = DataFrame( + {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) tm.assert_frame_equal(result, expected, check_index_type=False) def test_empty_with_mangled_column_pass_dtype_by_indexes(self): data = 'one,one' result = self.read_csv(StringIO(data), dtype={0: 'u1', 1: 'f'}) - expected = DataFrame({'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) + expected = DataFrame( + {'one': np.empty(0, dtype='u1'), 'one.1': np.empty(0, dtype='f')}) tm.assert_frame_equal(result, expected, check_index_type=False) def test_empty_with_dup_column_pass_dtype_by_names(self): data = 'one,one' - result = self.read_csv(StringIO(data), mangle_dupe_cols=False, dtype={'one': 'u1'}) + result = self.read_csv( + StringIO(data), mangle_dupe_cols=False, dtype={'one': 'u1'}) expected = pd.concat([Series([], name='one', dtype='u1')] * 2, axis=1) tm.assert_frame_equal(result, expected, check_index_type=False) def test_empty_with_dup_column_pass_dtype_by_indexes(self): ### FIXME in GH9424 - raise nose.SkipTest("GH 9424; known failure read_csv with duplicate columns") + raise nose.SkipTest( + "GH 9424; known failure read_csv with duplicate columns") data = 'one,one' - result = self.read_csv(StringIO(data), mangle_dupe_cols=False, dtype={0: 'u1', 1: 'f'}) + result = self.read_csv( + StringIO(data), mangle_dupe_cols=False, dtype={0: 'u1', 1: 'f'}) expected = pd.concat([Series([], name='one', dtype='u1'), Series([], name='one', dtype='f')], axis=1) tm.assert_frame_equal(result, expected, check_index_type=False) @@ -3758,13 +3854,13 @@ def test_usecols_dtypes(self): header=None, converters={'a': str}, dtype={'b': int, 'c': float}, - ) + ) result2 = self.read_csv(StringIO(data), usecols=(0, 2), - names=('a', 'b', 'c'), - header=None, - converters={'a': str}, - dtype={'b': int, 'c': float}, - ) + names=('a', 'b', 'c'), + header=None, + converters={'a': str}, + dtype={'b': int, 'c': float}, + ) self.assertTrue((result.dtypes == [object, np.int, np.float]).all()) self.assertTrue((result2.dtypes == [object, np.float]).all()) @@ -3869,7 +3965,7 @@ def test_decompression_regex_sep(self): # regex sep. Temporarily copied to TestPythonParser. 
# Here test for ValueError when passing regex sep: - with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX + with tm.assertRaisesRegexp(ValueError, 'regex sep'): # XXX result = self.read_csv(path, sep='::', compression='gzip') tm.assert_frame_equal(result, expected) @@ -3879,7 +3975,7 @@ def test_decompression_regex_sep(self): tmp.close() # GH 6607 - with tm.assertRaisesRegexp(ValueError, 'regex sep'): #XXX + with tm.assertRaisesRegexp(ValueError, 'regex sep'): # XXX result = self.read_csv(path, sep='::', compression='bz2') tm.assert_frame_equal(result, expected) @@ -4034,7 +4130,8 @@ def test_passing_dtype(self): # This is a copy which should eventually be merged into ParserTests # when the dtype argument is supported by all engines. - df = DataFrame(np.random.rand(5,2),columns=list('AB'),index=['1A','1B','1C','1D','1E']) + df = DataFrame(np.random.rand(5, 2), columns=list( + 'AB'), index=['1A', '1B', '1C', '1D', '1E']) with tm.ensure_clean('__passing_str_as_dtype__.csv') as path: df.to_csv(path) @@ -4042,24 +4139,26 @@ def test_passing_dtype(self): # GH 3795 # passing 'str' as the dtype result = self.read_csv(path, dtype=str, index_col=0) - tm.assert_series_equal(result.dtypes,Series({ 'A' : 'object', 'B' : 'object' })) + tm.assert_series_equal(result.dtypes, Series( + {'A': 'object', 'B': 'object'})) - # we expect all object columns, so need to convert to test for equivalence + # we expect all object columns, so need to convert to test for + # equivalence result = result.astype(float) - tm.assert_frame_equal(result,df) + tm.assert_frame_equal(result, df) # invalid dtype - self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'foo', 'B' : 'float64' }, + self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'foo', 'B': 'float64'}, index_col=0) # valid but we don't support it (date) - self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'datetime64', 'B': 'float64'}, index_col=0) - self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'datetime64', 'B' : 'float64' }, + self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'datetime64', 'B': 'float64'}, index_col=0, parse_dates=['B']) # valid but we don't support it - self.assertRaises(TypeError, self.read_csv, path, dtype={'A' : 'timedelta64', 'B' : 'float64' }, + self.assertRaises(TypeError, self.read_csv, path, dtype={'A': 'timedelta64', 'B': 'float64'}, index_col=0) def test_fallback_to_python(self): @@ -4069,7 +4168,7 @@ def test_fallback_to_python(self): # specify C engine with C-unsupported options (raise) with tm.assertRaisesRegexp(ValueError, 'does not support'): self.read_table(StringIO(data), engine='c', sep=None, - delim_whitespace=False) + delim_whitespace=False) with tm.assertRaisesRegexp(ValueError, 'does not support'): self.read_table(StringIO(data), engine='c', sep='\s') with tm.assertRaisesRegexp(ValueError, 'does not support'): @@ -4081,7 +4180,6 @@ def test_raise_on_sep_with_delim_whitespace(self): with tm.assertRaisesRegexp(ValueError, 'you can only specify one'): self.read_table(StringIO(data), sep='\s', delim_whitespace=True) - def test_buffer_overflow(self): # GH9205 # test certain malformed input files that cause buffer overflows in @@ -4093,7 +4191,8 @@ def test_buffer_overflow(self): try: df = self.read_table(StringIO(malf)) except Exception as cperr: - self.assertIn('Buffer overflow caught - possible malformed input file.', str(cperr)) + self.assertIn( + 'Buffer overflow 
caught - possible malformed input file.', str(cperr)) def test_single_char_leading_whitespace(self): # GH 9710 @@ -4104,7 +4203,7 @@ def test_single_char_leading_whitespace(self): a b\n""" - expected = DataFrame({'MyColumn' : list('abab')}) + expected = DataFrame({'MyColumn': list('abab')}) result = self.read_csv(StringIO(data), delim_whitespace=True, skipinitialspace=True) @@ -4233,7 +4332,7 @@ def test_fallback_to_python(self): # (options will be ignored on fallback, raise) with tm.assertRaisesRegexp(ValueError, 'Falling back'): pd.read_table(StringIO(data), sep=None, - delim_whitespace=False, dtype={'a': float}) + delim_whitespace=False, dtype={'a': float}) with tm.assertRaisesRegexp(ValueError, 'Falling back'): pd.read_table(StringIO(data), sep='\s', dtype={'a': float}) with tm.assertRaisesRegexp(ValueError, 'Falling back'): @@ -4315,6 +4414,7 @@ def test_convert_sql_column_decimals(self): class TestUrlGz(tm.TestCase): + def setUp(self): dirpath = tm.get_data_path() localtable = os.path.join(dirpath, 'salary.table') @@ -4334,6 +4434,7 @@ def test_url_gz_infer(self): class TestS3(tm.TestCase): + def setUp(self): try: import boto @@ -4349,10 +4450,12 @@ def test_parse_public_s3_bucket(self): 's3://pandas-test/tips.csv' + ext, compression=comp) else: - df = pd.read_csv('s3://pandas-test/tips.csv' + ext, compression=comp) + df = pd.read_csv('s3://pandas-test/tips.csv' + + ext, compression=comp) self.assertTrue(isinstance(df, pd.DataFrame)) self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(pd.read_csv( + tm.get_data_path('tips.csv')), df) # Read public file from bucket with not-public contents df = pd.read_csv('s3://cant_get_it/tips.csv') @@ -4366,7 +4469,8 @@ def test_parse_public_s3n_bucket(self): df = pd.read_csv('s3n://pandas-test/tips.csv', nrows=10) self.assertTrue(isinstance(df, pd.DataFrame)) self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(pd.read_csv( + tm.get_data_path('tips.csv')).iloc[:10], df) @tm.network def test_parse_public_s3a_bucket(self): @@ -4374,7 +4478,8 @@ def test_parse_public_s3a_bucket(self): df = pd.read_csv('s3a://pandas-test/tips.csv', nrows=10) self.assertTrue(isinstance(df, pd.DataFrame)) self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(pd.read_csv( + tm.get_data_path('tips.csv')).iloc[:10], df) @tm.network def test_parse_public_s3_bucket_nrows(self): @@ -4385,10 +4490,12 @@ def test_parse_public_s3_bucket_nrows(self): 's3://pandas-test/tips.csv' + ext, compression=comp) else: - df = pd.read_csv('s3://pandas-test/tips.csv' + ext, nrows=10, compression=comp) + df = pd.read_csv('s3://pandas-test/tips.csv' + + ext, nrows=10, compression=comp) self.assertTrue(isinstance(df, pd.DataFrame)) self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(pd.read_csv( + tm.get_data_path('tips.csv')).iloc[:10], df) @tm.network def test_parse_public_s3_bucket_chunked(self): @@ -4406,12 +4513,15 @@ def test_parse_public_s3_bucket_chunked(self): chunksize=chunksize, compression=comp) self.assertEqual(df_reader.chunksize, chunksize) for i_chunk in [0, 1, 2]: - # Read a couple of chunks and make sure we see them properly. + # Read a couple of chunks and make sure we see them + # properly. 
df = df_reader.get_chunk() self.assertTrue(isinstance(df, pd.DataFrame)) self.assertFalse(df.empty) - true_df = local_tips.iloc[chunksize * i_chunk: chunksize * (i_chunk + 1)] - true_df = true_df.reset_index().drop('index', axis=1) # Chunking doesn't preserve row numbering + true_df = local_tips.iloc[ + chunksize * i_chunk: chunksize * (i_chunk + 1)] + # Chunking doesn't preserve row numbering + true_df = true_df.reset_index().drop('index', axis=1) tm.assert_frame_equal(true_df, df) @tm.network @@ -4429,8 +4539,10 @@ def test_parse_public_s3_bucket_chunked_python(self): df = df_reader.get_chunk() self.assertTrue(isinstance(df, pd.DataFrame)) self.assertFalse(df.empty) - true_df = local_tips.iloc[chunksize * i_chunk: chunksize * (i_chunk + 1)] - true_df = true_df.reset_index().drop('index', axis=1) # Chunking doesn't preserve row numbering + true_df = local_tips.iloc[ + chunksize * i_chunk: chunksize * (i_chunk + 1)] + # Chunking doesn't preserve row numbering + true_df = true_df.reset_index().drop('index', axis=1) tm.assert_frame_equal(true_df, df) @tm.network @@ -4440,7 +4552,8 @@ def test_parse_public_s3_bucket_python(self): compression=comp) self.assertTrue(isinstance(df, pd.DataFrame)) self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(pd.read_csv( + tm.get_data_path('tips.csv')), df) @tm.network def test_infer_s3_compression(self): @@ -4449,7 +4562,8 @@ def test_infer_s3_compression(self): engine='python', compression='infer') self.assertTrue(isinstance(df, pd.DataFrame)) self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')), df) + tm.assert_frame_equal(pd.read_csv( + tm.get_data_path('tips.csv')), df) @tm.network def test_parse_public_s3_bucket_nrows_python(self): @@ -4458,13 +4572,14 @@ def test_parse_public_s3_bucket_nrows_python(self): nrows=10, compression=comp) self.assertTrue(isinstance(df, pd.DataFrame)) self.assertFalse(df.empty) - tm.assert_frame_equal(pd.read_csv(tm.get_data_path('tips.csv')).iloc[:10], df) + tm.assert_frame_equal(pd.read_csv( + tm.get_data_path('tips.csv')).iloc[:10], df) @tm.network def test_s3_fails(self): import boto with tm.assertRaisesRegexp(boto.exception.S3ResponseError, - 'S3ResponseError: 404 Not Found'): + 'S3ResponseError: 404 Not Found'): pd.read_csv('s3://nyqpug/asdf.csv') # Receive a permission error when trying to read a private bucket. diff --git a/pandas/io/tests/test_pickle.py b/pandas/io/tests/test_pickle.py index 2a4e429e28580..61f78b2b619fc 100644 --- a/pandas/io/tests/test_pickle.py +++ b/pandas/io/tests/test_pickle.py @@ -2,23 +2,18 @@ """ manage legacy pickle tests """ -from datetime import datetime, timedelta -import operator -import pickle as pkl import nose import os from distutils.version import LooseVersion -import numpy as np -import pandas.util.testing as tm import pandas as pd from pandas import Index -from pandas.sparse.tests import test_sparse -from pandas import compat from pandas.compat import u +from pandas.sparse.tests import test_sparse from pandas.util.misc import is_little_endian import pandas +import pandas.util.testing as tm from pandas.tseries.offsets import Day, MonthEnd @@ -34,26 +29,29 @@ class TestPickle(): 3. Move the created pickle to "data/legacy_pickle/" directory. NOTE: TestPickle can't be a subclass of tm.Testcase to use test generator. 
- http://stackoverflow.com/questions/6689537/nose-test-generators-inside-class + http://stackoverflow.com/questions/6689537/ + nose-test-generators-inside-class """ _multiprocess_can_split_ = True def setUp(self): - from pandas.io.tests.generate_legacy_storage_files import create_pickle_data + from pandas.io.tests.generate_legacy_storage_files import ( + create_pickle_data) self.data = create_pickle_data() self.path = u('__%s__.pickle' % tm.rands(10)) def compare_element(self, result, expected, typ, version=None): - if isinstance(expected,Index): + if isinstance(expected, Index): tm.assert_index_equal(expected, result) return if typ.startswith('sp_'): - comparator = getattr(test_sparse,"assert_%s_equal" % typ) - comparator(result,expected,exact_indices=False) + comparator = getattr(test_sparse, "assert_%s_equal" % typ) + comparator(result, expected, exact_indices=False) else: - comparator = getattr(tm,"assert_%s_equal" % typ,tm.assert_almost_equal) - comparator(result,expected) + comparator = getattr(tm, "assert_%s_equal" % + typ, tm.assert_almost_equal) + comparator(result, expected) def compare(self, vf, version): @@ -76,7 +74,8 @@ def compare(self, vf, version): # use a specific comparator # if available - comparator = getattr(self,"compare_{typ}_{dt}".format(typ=typ,dt=dt), self.compare_element) + comparator = getattr(self, "compare_{typ}_{dt}".format( + typ=typ, dt=dt), self.compare_element) comparator(result, expected, typ, version) return data @@ -113,7 +112,8 @@ def read_pickles(self, version): if 'series' in data: if 'ts' in data['series']: - self._validate_timeseries(data['series']['ts'], self.data['series']['ts']) + self._validate_timeseries( + data['series']['ts'], self.data['series']['ts']) self._validate_frequency(data['series']['ts']) if 'index' in data: if 'period' in data['index']: @@ -136,12 +136,13 @@ def test_round_trip_current(self): try: import cPickle as c_pickle - def c_pickler(obj,path): - with open(path,'wb') as fh: - c_pickle.dump(obj,fh,protocol=-1) + + def c_pickler(obj, path): + with open(path, 'wb') as fh: + c_pickle.dump(obj, fh, protocol=-1) def c_unpickler(path): - with open(path,'rb') as fh: + with open(path, 'rb') as fh: fh.seek(0) return c_pickle.load(fh) except: @@ -150,26 +151,26 @@ def c_unpickler(path): import pickle as python_pickle - def python_pickler(obj,path): - with open(path,'wb') as fh: - python_pickle.dump(obj,fh,protocol=-1) + def python_pickler(obj, path): + with open(path, 'wb') as fh: + python_pickle.dump(obj, fh, protocol=-1) def python_unpickler(path): - with open(path,'rb') as fh: + with open(path, 'rb') as fh: fh.seek(0) return python_pickle.load(fh) for typ, dv in self.data.items(): for dt, expected in dv.items(): - for writer in [pd.to_pickle, c_pickler, python_pickler ]: + for writer in [pd.to_pickle, c_pickler, python_pickler]: if writer is None: continue with tm.ensure_clean(self.path) as path: # test writing with each pickler - writer(expected,path) + writer(expected, path) # test reading with each unpickler result = pd.read_pickle(path) @@ -212,7 +213,6 @@ def _validate_periodindex(self, pickled, current): if __name__ == '__main__': - import nose nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'], # '--with-coverage', '--cover-package=pandas.core'], exit=False) diff --git a/pandas/io/tests/test_pytables.py b/pandas/io/tests/test_pytables.py index 38f5150516551..b08d24747bcd3 100644 --- a/pandas/io/tests/test_pytables.py +++ b/pandas/io/tests/test_pytables.py @@ -28,6 +28,7 @@ AttributeConflictWarning, 
DuplicateWarning, PossibleDataLossError, ClosedFileError) from pandas.io import pytables as pytables +import pandas.core.common as com import pandas.util.testing as tm from pandas.util.testing import (assert_panel4d_equal, assert_panel_equal, @@ -37,8 +38,6 @@ from pandas import concat, Timestamp from pandas import compat from pandas.compat import range, lrange, u -from pandas.util.testing import assert_produces_warning -from numpy.testing.decorators import slow try: import tables @@ -57,6 +56,8 @@ skip_compression = PY3 and is_platform_windows() # contextmanager to ensure the file cleanup + + def safe_remove(path): if path is not None: try: @@ -75,12 +76,12 @@ def safe_close(store): def create_tempfile(path): """ create an unopened named temporary file """ - return os.path.join(tempfile.gettempdir(),path) + return os.path.join(tempfile.gettempdir(), path) @contextmanager def ensure_clean_store(path, mode='a', complevel=None, complib=None, - fletcher32=False): + fletcher32=False): try: @@ -106,10 +107,10 @@ def ensure_clean_path(path): """ try: if isinstance(path, list): - filenames = [ create_tempfile(p) for p in path ] + filenames = [create_tempfile(p) for p in path] yield filenames else: - filenames = [ create_tempfile(path) ] + filenames = [create_tempfile(path)] yield filenames[0] finally: for f in filenames: @@ -118,8 +119,9 @@ def ensure_clean_path(path): # set these parameters so we don't have file sharing tables.parameters.MAX_NUMEXPR_THREADS = 1 -tables.parameters.MAX_BLOSC_THREADS = 1 -tables.parameters.MAX_THREADS = 1 +tables.parameters.MAX_BLOSC_THREADS = 1 +tables.parameters.MAX_THREADS = 1 + def _maybe_remove(store, key): """For tests using tables, try removing the table to be sure there is @@ -209,27 +211,27 @@ def test_context(self): def test_conv_read_write(self): path = create_tempfile(self.path) try: - def roundtrip(key, obj,**kwargs): - obj.to_hdf(path, key,**kwargs) + def roundtrip(key, obj, **kwargs): + obj.to_hdf(path, key, **kwargs) return read_hdf(path, key) o = tm.makeTimeSeries() - assert_series_equal(o, roundtrip('series',o)) + assert_series_equal(o, roundtrip('series', o)) o = tm.makeStringSeries() - assert_series_equal(o, roundtrip('string_series',o)) + assert_series_equal(o, roundtrip('string_series', o)) o = tm.makeDataFrame() - assert_frame_equal(o, roundtrip('frame',o)) + assert_frame_equal(o, roundtrip('frame', o)) o = tm.makePanel() - assert_panel_equal(o, roundtrip('panel',o)) + assert_panel_equal(o, roundtrip('panel', o)) # table df = DataFrame(dict(A=lrange(5), B=lrange(5))) - df.to_hdf(path,'table',append=True) - result = read_hdf(path, 'table', where = ['index>2']) - assert_frame_equal(df[df.index>2],result) + df.to_hdf(path, 'table', append=True) + result = read_hdf(path, 'table', where=['index>2']) + assert_frame_equal(df[df.index > 2], result) finally: safe_remove(path) @@ -248,7 +250,6 @@ def test_long_strings(self): result = store.select('df') assert_frame_equal(df, result) - def test_api(self): # GH4584 @@ -256,80 +257,84 @@ def test_api(self): with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path,'df',append=True,format='table') - df.iloc[10:].to_hdf(path,'df',append=True,format='table') - assert_frame_equal(read_hdf(path,'df'),df) + df.iloc[:10].to_hdf(path, 'df', append=True, format='table') + df.iloc[10:].to_hdf(path, 'df', append=True, format='table') + assert_frame_equal(read_hdf(path, 'df'), df) # append to False - df.iloc[:10].to_hdf(path,'df',append=False,format='table') - 
df.iloc[10:].to_hdf(path,'df',append=True,format='table') - assert_frame_equal(read_hdf(path,'df'),df) + df.iloc[:10].to_hdf(path, 'df', append=False, format='table') + df.iloc[10:].to_hdf(path, 'df', append=True, format='table') + assert_frame_equal(read_hdf(path, 'df'), df) with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.iloc[:10].to_hdf(path,'df',append=True) - df.iloc[10:].to_hdf(path,'df',append=True,format='table') - assert_frame_equal(read_hdf(path,'df'),df) + df.iloc[:10].to_hdf(path, 'df', append=True) + df.iloc[10:].to_hdf(path, 'df', append=True, format='table') + assert_frame_equal(read_hdf(path, 'df'), df) # append to False - df.iloc[:10].to_hdf(path,'df',append=False,format='table') - df.iloc[10:].to_hdf(path,'df',append=True) - assert_frame_equal(read_hdf(path,'df'),df) + df.iloc[:10].to_hdf(path, 'df', append=False, format='table') + df.iloc[10:].to_hdf(path, 'df', append=True) + assert_frame_equal(read_hdf(path, 'df'), df) with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.to_hdf(path,'df',append=False,format='fixed') - assert_frame_equal(read_hdf(path,'df'),df) + df.to_hdf(path, 'df', append=False, format='fixed') + assert_frame_equal(read_hdf(path, 'df'), df) - df.to_hdf(path,'df',append=False,format='f') - assert_frame_equal(read_hdf(path,'df'),df) + df.to_hdf(path, 'df', append=False, format='f') + assert_frame_equal(read_hdf(path, 'df'), df) - df.to_hdf(path,'df',append=False) - assert_frame_equal(read_hdf(path,'df'),df) + df.to_hdf(path, 'df', append=False) + assert_frame_equal(read_hdf(path, 'df'), df) - df.to_hdf(path,'df') - assert_frame_equal(read_hdf(path,'df'),df) + df.to_hdf(path, 'df') + assert_frame_equal(read_hdf(path, 'df'), df) with ensure_clean_store(self.path) as store: path = store._path df = tm.makeDataFrame() - _maybe_remove(store,'df') - store.append('df',df.iloc[:10],append=True,format='table') - store.append('df',df.iloc[10:],append=True,format='table') - assert_frame_equal(store.select('df'),df) + _maybe_remove(store, 'df') + store.append('df', df.iloc[:10], append=True, format='table') + store.append('df', df.iloc[10:], append=True, format='table') + assert_frame_equal(store.select('df'), df) # append to False - _maybe_remove(store,'df') - store.append('df',df.iloc[:10],append=False,format='table') - store.append('df',df.iloc[10:],append=True,format='table') - assert_frame_equal(store.select('df'),df) + _maybe_remove(store, 'df') + store.append('df', df.iloc[:10], append=False, format='table') + store.append('df', df.iloc[10:], append=True, format='table') + assert_frame_equal(store.select('df'), df) # formats - _maybe_remove(store,'df') - store.append('df',df.iloc[:10],append=False,format='table') - store.append('df',df.iloc[10:],append=True,format='table') - assert_frame_equal(store.select('df'),df) + _maybe_remove(store, 'df') + store.append('df', df.iloc[:10], append=False, format='table') + store.append('df', df.iloc[10:], append=True, format='table') + assert_frame_equal(store.select('df'), df) - _maybe_remove(store,'df') - store.append('df',df.iloc[:10],append=False,format='table') - store.append('df',df.iloc[10:],append=True,format=None) - assert_frame_equal(store.select('df'),df) + _maybe_remove(store, 'df') + store.append('df', df.iloc[:10], append=False, format='table') + store.append('df', df.iloc[10:], append=True, format=None) + assert_frame_equal(store.select('df'), df) with ensure_clean_path(self.path) as path: # invalid df = tm.makeDataFrame() - self.assertRaises(ValueError, 
df.to_hdf, path,'df',append=True,format='f') - self.assertRaises(ValueError, df.to_hdf, path,'df',append=True,format='fixed') + self.assertRaises(ValueError, df.to_hdf, path, + 'df', append=True, format='f') + self.assertRaises(ValueError, df.to_hdf, path, + 'df', append=True, format='fixed') - self.assertRaises(TypeError, df.to_hdf, path,'df',append=True,format='foo') - self.assertRaises(TypeError, df.to_hdf, path,'df',append=False,format='bar') + self.assertRaises(TypeError, df.to_hdf, path, + 'df', append=True, format='foo') + self.assertRaises(TypeError, df.to_hdf, path, + 'df', append=False, format='bar') - #File path doesn't exist + # File path doesn't exist path = "" self.assertRaises(IOError, read_hdf, path, 'df') @@ -339,41 +344,41 @@ def test_api_default_format(self): with ensure_clean_store(self.path) as store: df = tm.makeDataFrame() - pandas.set_option('io.hdf.default_format','fixed') - _maybe_remove(store,'df') - store.put('df',df) + pandas.set_option('io.hdf.default_format', 'fixed') + _maybe_remove(store, 'df') + store.put('df', df) self.assertFalse(store.get_storer('df').is_table) - self.assertRaises(ValueError, store.append, 'df2',df) + self.assertRaises(ValueError, store.append, 'df2', df) - pandas.set_option('io.hdf.default_format','table') - _maybe_remove(store,'df') - store.put('df',df) + pandas.set_option('io.hdf.default_format', 'table') + _maybe_remove(store, 'df') + store.put('df', df) self.assertTrue(store.get_storer('df').is_table) - _maybe_remove(store,'df2') - store.append('df2',df) + _maybe_remove(store, 'df2') + store.append('df2', df) self.assertTrue(store.get_storer('df').is_table) - pandas.set_option('io.hdf.default_format',None) + pandas.set_option('io.hdf.default_format', None) with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - pandas.set_option('io.hdf.default_format','fixed') - df.to_hdf(path,'df') + pandas.set_option('io.hdf.default_format', 'fixed') + df.to_hdf(path, 'df') with get_store(path) as store: self.assertFalse(store.get_storer('df').is_table) - self.assertRaises(ValueError, df.to_hdf, path,'df2', append=True) + self.assertRaises(ValueError, df.to_hdf, path, 'df2', append=True) - pandas.set_option('io.hdf.default_format','table') - df.to_hdf(path,'df3') + pandas.set_option('io.hdf.default_format', 'table') + df.to_hdf(path, 'df3') with HDFStore(path) as store: self.assertTrue(store.get_storer('df3').is_table) - df.to_hdf(path,'df4',append=True) + df.to_hdf(path, 'df4', append=True) with HDFStore(path) as store: self.assertTrue(store.get_storer('df4').is_table) - pandas.set_option('io.hdf.default_format',None) + pandas.set_option('io.hdf.default_format', None) def test_keys(self): @@ -408,9 +413,9 @@ def test_repr(self): df['int2'] = 2 df['timestamp1'] = Timestamp('20010102') df['timestamp2'] = Timestamp('20010103') - df['datetime1'] = datetime.datetime(2001,1,2,0,0) - df['datetime2'] = datetime.datetime(2001,1,3,0,0) - df.ix[3:6,['obj1']] = np.nan + df['datetime1'] = datetime.datetime(2001, 1, 2, 0, 0) + df['datetime2'] = datetime.datetime(2001, 1, 3, 0, 0) + df.ix[3:6, ['obj1']] = np.nan df = df.consolidate()._convert(datetime=True) warnings.filterwarnings('ignore', category=PerformanceWarning) @@ -418,7 +423,7 @@ def test_repr(self): warnings.filterwarnings('always', category=PerformanceWarning) # make a random group in hdf space - store._handle.create_group(store._handle.root,'bah') + store._handle.create_group(store._handle.root, 'bah') repr(store) str(store) @@ -427,7 +432,7 @@ def test_repr(self): with 
ensure_clean_store(self.path) as store: df = tm.makeDataFrame() - store.append('df',df) + store.append('df', df) s = store.get_storer('df') repr(s) @@ -448,7 +453,8 @@ def test_contains(self): self.assertNotIn('bar', store) # GH 2694 - warnings.filterwarnings('ignore', category=tables.NaturalNameWarning) + warnings.filterwarnings( + 'ignore', category=tables.NaturalNameWarning) store['node())'] = tm.makeDataFrame() self.assertIn('node())', store) @@ -469,8 +475,8 @@ def test_versioning(self): _maybe_remove(store, 'df2') store.append('df2', df) - # this is an error because its table_type is appendable, but no version - # info + # this is an error because its table_type is appendable, but no + # version info store.get_node('df2')._v_attrs.pandas_version = None self.assertRaises(Exception, store.select, 'df2') @@ -483,41 +489,43 @@ def check(mode): with ensure_clean_path(self.path) as path: # constructor - if mode in ['r','r+']: + if mode in ['r', 'r+']: self.assertRaises(IOError, HDFStore, path, mode=mode) else: - store = HDFStore(path,mode=mode) + store = HDFStore(path, mode=mode) self.assertEqual(store._handle.mode, mode) store.close() with ensure_clean_path(self.path) as path: # context - if mode in ['r','r+']: + if mode in ['r', 'r+']: def f(): - with HDFStore(path,mode=mode) as store: + with HDFStore(path, mode=mode) as store: # noqa pass self.assertRaises(IOError, f) else: - with HDFStore(path,mode=mode) as store: + with HDFStore(path, mode=mode) as store: self.assertEqual(store._handle.mode, mode) with ensure_clean_path(self.path) as path: # conv write - if mode in ['r','r+']: - self.assertRaises(IOError, df.to_hdf, path, 'df', mode=mode) - df.to_hdf(path,'df',mode='w') + if mode in ['r', 'r+']: + self.assertRaises(IOError, df.to_hdf, + path, 'df', mode=mode) + df.to_hdf(path, 'df', mode='w') else: - df.to_hdf(path,'df',mode=mode) + df.to_hdf(path, 'df', mode=mode) # conv read if mode in ['w']: - self.assertRaises(KeyError, read_hdf, path, 'df', mode=mode) + self.assertRaises(KeyError, read_hdf, + path, 'df', mode=mode) else: - result = read_hdf(path,'df',mode=mode) - assert_frame_equal(result,df) + result = read_hdf(path, 'df', mode=mode) + assert_frame_equal(result, df) check('r') check('r+') @@ -528,7 +536,7 @@ def test_reopen_handle(self): with ensure_clean_path(self.path) as path: - store = HDFStore(path,mode='a') + store = HDFStore(path, mode='a') store['a'] = tm.makeTimeSeries() # invalid mode change @@ -543,7 +551,7 @@ def test_reopen_handle(self): store.close() self.assertFalse(store.is_open) - store = HDFStore(path,mode='a') + store = HDFStore(path, mode='a') store['a'] = tm.makeTimeSeries() # reopen as read @@ -577,12 +585,13 @@ def test_open_args(self): df = tm.makeDataFrame() # create an in memory store - store = HDFStore(path,mode='a',driver='H5FD_CORE',driver_core_backing_store=0) + store = HDFStore(path, mode='a', driver='H5FD_CORE', + driver_core_backing_store=0) store['df'] = df - store.append('df2',df) + store.append('df2', df) - tm.assert_frame_equal(store['df'],df) - tm.assert_frame_equal(store['df2'],df) + tm.assert_frame_equal(store['df'], df) + tm.assert_frame_equal(store['df2'], df) store.close() @@ -620,7 +629,7 @@ def test_getattr(self): # test attribute access result = store.a tm.assert_series_equal(result, s) - result = getattr(store,'a') + result = getattr(store, 'a') tm.assert_series_equal(result, s) df = tm.makeTimeDataFrame() @@ -631,12 +640,12 @@ def test_getattr(self): # errors self.assertRaises(AttributeError, getattr, store, 'd') - for x in 
['mode','path','handle','complib']: + for x in ['mode', 'path', 'handle', 'complib']: self.assertRaises(AttributeError, getattr, store, x) # not stores - for x in ['mode','path','handle','complib']: - getattr(store,"_%s" % x) + for x in ['mode', 'path', 'handle', 'complib']: + getattr(store, "_%s" % x) def test_put(self): @@ -655,10 +664,11 @@ def test_put(self): self.assertRaises( ValueError, store.put, 'b', df[10:], append=True) - # node does not currently exist, test _is_table_type returns False in - # this case + # node does not currently exist, test _is_table_type returns False + # in this case # _maybe_remove(store, 'f') - # self.assertRaises(ValueError, store.put, 'f', df[10:], append=True) + # self.assertRaises(ValueError, store.put, 'f', df[10:], + # append=True) # can't put to a table (use append instead) self.assertRaises(ValueError, store.put, 'c', df[10:], append=True) @@ -683,7 +693,9 @@ def test_put_string_index(self): tm.assert_frame_equal(store['b'], df) # mixed length - index = Index(['abcdefghijklmnopqrstuvwxyz1234567890'] + ["I am a very long string index: %s" % i for i in range(20)]) + index = Index(['abcdefghijklmnopqrstuvwxyz1234567890'] + + ["I am a very long string index: %s" % i + for i in range(20)]) s = Series(np.arange(21), index=index) df = DataFrame({'A': s, 'B': s}) store['a'] = s @@ -747,11 +759,11 @@ def test_put_mixed_type(self): # cannot use assert_produces_warning here for some reason # a PendingDeprecationWarning is also raised? warnings.filterwarnings('ignore', category=PerformanceWarning) - store.put('df',df) + store.put('df', df) warnings.filterwarnings('always', category=PerformanceWarning) expected = store.get('df') - tm.assert_frame_equal(expected,df) + tm.assert_frame_equal(expected, df) def test_append(self): @@ -773,7 +785,8 @@ def test_append(self): tm.assert_frame_equal(store['df3'], df) # this is allowed by almost always don't want to do it - with tm.assert_produces_warning(expected_warning=tables.NaturalNameWarning): + with tm.assert_produces_warning( + expected_warning=tables.NaturalNameWarning): _maybe_remove(store, '/df3 foo') store.append('/df3 foo', df[:10]) store.append('/df3 foo', df[10:]) @@ -796,9 +809,9 @@ def test_append(self): # test using axis labels _maybe_remove(store, 'p4d') store.append('p4d', p4d.ix[:, :, :10, :], axes=[ - 'items', 'major_axis', 'minor_axis']) + 'items', 'major_axis', 'minor_axis']) store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ - 'items', 'major_axis', 'minor_axis']) + 'items', 'major_axis', 'minor_axis']) assert_panel4d_equal(store['p4d'], p4d) # test using differnt number of items on each axis @@ -827,18 +840,24 @@ def test_append(self): tm.assert_frame_equal(store['df'], df) # uints - test storage of uints - uint_data = DataFrame({'u08' : Series(np.random.random_integers(0, high=255, size=5), dtype=np.uint8), - 'u16' : Series(np.random.random_integers(0, high=65535, size=5), dtype=np.uint16), - 'u32' : Series(np.random.random_integers(0, high=2**30, size=5), dtype=np.uint32), - 'u64' : Series([2**58, 2**59, 2**60, 2**61, 2**62], dtype=np.uint64)}, - index=np.arange(5)) + uint_data = DataFrame({ + 'u08': Series(np.random.random_integers(0, high=255, size=5), + dtype=np.uint8), + 'u16': Series(np.random.random_integers(0, high=65535, size=5), + dtype=np.uint16), + 'u32': Series(np.random.random_integers(0, high=2**30, size=5), + dtype=np.uint32), + 'u64': Series([2**58, 2**59, 2**60, 2**61, 2**62], + dtype=np.uint64)}, index=np.arange(5)) _maybe_remove(store, 'uints') store.append('uints', uint_data) 
tm.assert_frame_equal(store['uints'], uint_data) # uints - test storage of uints in indexable columns _maybe_remove(store, 'uints') - store.append('uints', uint_data, data_columns=['u08','u16','u32']) # 64-bit indices not yet supported + # 64-bit indices not yet supported + store.append('uints', uint_data, data_columns=[ + 'u08', 'u16', 'u32']) tm.assert_frame_equal(store['uints'], uint_data) def test_append_series(self): @@ -867,21 +886,21 @@ def test_append_series(self): self.assertEqual(result.name, ns.name) # select on the values - expected = ns[ns>60] - result = store.select('ns',Term('foo>60')) - tm.assert_series_equal(result,expected) + expected = ns[ns > 60] + result = store.select('ns', Term('foo>60')) + tm.assert_series_equal(result, expected) # select on the index and values - expected = ns[(ns>70) & (ns.index<90)] - result = store.select('ns',[Term('foo>70'), Term('index<90')]) - tm.assert_series_equal(result,expected) + expected = ns[(ns > 70) & (ns.index < 90)] + result = store.select('ns', [Term('foo>70'), Term('index<90')]) + tm.assert_series_equal(result, expected) # multi-index - mi = DataFrame(np.random.randn(5,1),columns=['A']) + mi = DataFrame(np.random.randn(5, 1), columns=['A']) mi['B'] = np.arange(len(mi)) mi['C'] = 'foo' - mi.loc[3:5,'C'] = 'bar' - mi.set_index(['C','B'],inplace=True) + mi.loc[3:5, 'C'] = 'bar' + mi.set_index(['C', 'B'], inplace=True) s = mi.stack() s.index = s.index.droplevel(2) store.append('mi', s) @@ -893,36 +912,37 @@ def test_store_index_types(self): with ensure_clean_store(self.path) as store: - def check(format,index): - df = DataFrame(np.random.randn(10,2),columns=list('AB')) + def check(format, index): + df = DataFrame(np.random.randn(10, 2), columns=list('AB')) df.index = index(len(df)) _maybe_remove(store, 'df') - store.put('df',df,format=format) - assert_frame_equal(df,store['df']) + store.put('df', df, format=format) + assert_frame_equal(df, store['df']) - for index in [ tm.makeFloatIndex, tm.makeStringIndex, tm.makeIntIndex, - tm.makeDateIndex ]: + for index in [tm.makeFloatIndex, tm.makeStringIndex, + tm.makeIntIndex, tm.makeDateIndex]: - check('table',index) - check('fixed',index) + check('table', index) + check('fixed', index) # period index currently broken for table # seee GH7796 FIXME - check('fixed',tm.makePeriodIndex) - #check('table',tm.makePeriodIndex) + check('fixed', tm.makePeriodIndex) + # check('table',tm.makePeriodIndex) # unicode index = tm.makeUnicodeIndex if compat.PY3: - check('table',index) - check('fixed',index) + check('table', index) + check('fixed', index) else: # only support for fixed types (and they have a perf warning) self.assertRaises(TypeError, check, 'table', index) - with tm.assert_produces_warning(expected_warning=PerformanceWarning): - check('fixed',index) + with tm.assert_produces_warning( + expected_warning=PerformanceWarning): + check('fixed', index) def test_encoding(self): @@ -930,21 +950,22 @@ def test_encoding(self): raise nose.SkipTest('system byteorder is not little') with ensure_clean_store(self.path) as store: - df = DataFrame(dict(A='foo',B='bar'),index=range(5)) - df.loc[2,'A'] = np.nan - df.loc[3,'B'] = np.nan + df = DataFrame(dict(A='foo', B='bar'), index=range(5)) + df.loc[2, 'A'] = np.nan + df.loc[3, 'B'] = np.nan _maybe_remove(store, 'df') store.append('df', df, encoding='ascii') tm.assert_frame_equal(store['df'], df) expected = df.reindex(columns=['A']) - result = store.select('df',Term('columns=A',encoding='ascii')) - tm.assert_frame_equal(result,expected) + result = 
store.select('df', Term('columns=A', encoding='ascii')) + tm.assert_frame_equal(result, expected) def test_latin_encoding(self): if compat.PY2: - self.assertRaisesRegexp(TypeError, '\[unicode\] is not implemented as a table column') + self.assertRaisesRegexp( + TypeError, '\[unicode\] is not implemented as a table column') return values = [[b'E\xc9, 17', b'', b'a', b'b', b'c'], @@ -973,7 +994,7 @@ def _try_decode(x, encoding='latin-1'): def roundtrip(s, key='data', encoding='latin-1', nan_rep=''): with ensure_clean_path(self.path) as store: s.to_hdf(store, key, format='table', encoding=encoding, - nan_rep=nan_rep) + nan_rep=nan_rep) retr = read_hdf(store, key) s_nan = s.replace(nan_rep, np.nan) assert_series_equal(s_nan, retr) @@ -985,25 +1006,26 @@ def roundtrip(s, key='data', encoding='latin-1', nan_rep=''): # for x in examples: # roundtrip(s, nan_rep=b'\xf8\xfc') - def test_append_some_nans(self): with ensure_clean_store(self.path) as store: - df = DataFrame({'A' : Series(np.random.randn(20)).astype('int32'), - 'A1' : np.random.randn(20), - 'A2' : np.random.randn(20), - 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime.datetime(2001,1,2,0,0) }, + df = DataFrame({'A': Series(np.random.randn(20)).astype('int32'), + 'A1': np.random.randn(20), + 'A2': np.random.randn(20), + 'B': 'foo', 'C': 'bar', + 'D': Timestamp("20010101"), + 'E': datetime.datetime(2001, 1, 2, 0, 0)}, index=np.arange(20)) # some nans _maybe_remove(store, 'df1') - df.ix[0:15,['A1','B','D','E']] = np.nan + df.ix[0:15, ['A1', 'B', 'D', 'E']] = np.nan store.append('df1', df[:10]) store.append('df1', df[10:]) tm.assert_frame_equal(store['df1'], df) # first column df1 = df.copy() - df1.ix[:,'A1'] = np.nan + df1.ix[:, 'A1'] = np.nan _maybe_remove(store, 'df1') store.append('df1', df1[:10]) store.append('df1', df1[10:]) @@ -1011,7 +1033,7 @@ def test_append_some_nans(self): # 2nd column df2 = df.copy() - df2.ix[:,'A2'] = np.nan + df2.ix[:, 'A2'] = np.nan _maybe_remove(store, 'df2') store.append('df2', df2[:10]) store.append('df2', df2[10:]) @@ -1019,7 +1041,7 @@ def test_append_some_nans(self): # datetimes df3 = df.copy() - df3.ix[:,'E'] = np.nan + df3.ix[:, 'E'] = np.nan _maybe_remove(store, 'df3') store.append('df3', df3[:10]) store.append('df3', df3[10:]) @@ -1029,11 +1051,10 @@ def test_append_all_nans(self): with ensure_clean_store(self.path) as store: - df = DataFrame({'A1' : np.random.randn(20), - 'A2' : np.random.randn(20)}, + df = DataFrame({'A1': np.random.randn(20), + 'A2': np.random.randn(20)}, index=np.arange(20)) - df.ix[0:15,:] = np.nan - + df.ix[0:15, :] = np.nan # nan some entire rows (dropna=True) _maybe_remove(store, 'df') @@ -1048,25 +1069,25 @@ def test_append_all_nans(self): tm.assert_frame_equal(store['df2'], df) # tests the option io.hdf.dropna_table - pandas.set_option('io.hdf.dropna_table',False) + pandas.set_option('io.hdf.dropna_table', False) _maybe_remove(store, 'df3') store.append('df3', df[:10]) store.append('df3', df[10:]) tm.assert_frame_equal(store['df3'], df) - pandas.set_option('io.hdf.dropna_table',True) + pandas.set_option('io.hdf.dropna_table', True) _maybe_remove(store, 'df4') store.append('df4', df[:10]) store.append('df4', df[10:]) tm.assert_frame_equal(store['df4'], df[-4:]) # nan some entire rows (string are still written!) 
- df = DataFrame({'A1' : np.random.randn(20), - 'A2' : np.random.randn(20), - 'B' : 'foo', 'C' : 'bar'}, + df = DataFrame({'A1': np.random.randn(20), + 'A2': np.random.randn(20), + 'B': 'foo', 'C': 'bar'}, index=np.arange(20)) - df.ix[0:15,:] = np.nan + df.ix[0:15, :] = np.nan _maybe_remove(store, 'df') store.append('df', df[:10], dropna=True) @@ -1078,13 +1099,16 @@ def test_append_all_nans(self): store.append('df2', df[10:], dropna=False) tm.assert_frame_equal(store['df2'], df) - # nan some entire rows (but since we have dates they are still written!) - df = DataFrame({'A1' : np.random.randn(20), - 'A2' : np.random.randn(20), - 'B' : 'foo', 'C' : 'bar', 'D' : Timestamp("20010101"), 'E' : datetime.datetime(2001,1,2,0,0) }, + # nan some entire rows (but since we have dates they are still + # written!) + df = DataFrame({'A1': np.random.randn(20), + 'A2': np.random.randn(20), + 'B': 'foo', 'C': 'bar', + 'D': Timestamp("20010101"), + 'E': datetime.datetime(2001, 1, 2, 0, 0)}, index=np.arange(20)) - df.ix[0:15,:] = np.nan + df.ix[0:15, :] = np.nan _maybe_remove(store, 'df') store.append('df', df[:10], dropna=True) @@ -1098,25 +1122,27 @@ def test_append_all_nans(self): # Test to make sure defaults are to not drop. # Corresponding to Issue 9382 - df_with_missing = DataFrame({'col1':[0, np.nan, 2], 'col2':[1, np.nan, np.nan]}) + df_with_missing = DataFrame( + {'col1': [0, np.nan, 2], 'col2': [1, np.nan, np.nan]}) with ensure_clean_path(self.path) as path: - df_with_missing.to_hdf(path, 'df_with_missing', format = 'table') + df_with_missing.to_hdf(path, 'df_with_missing', format='table') reloaded = read_hdf(path, 'df_with_missing') tm.assert_frame_equal(df_with_missing, reloaded) - matrix = [[[np.nan, np.nan, np.nan],[1,np.nan,np.nan]], - [[np.nan, np.nan, np.nan], [np.nan,5,6]], - [[np.nan, np.nan, np.nan],[np.nan,3,np.nan]]] + matrix = [[[np.nan, np.nan, np.nan], [1, np.nan, np.nan]], + [[np.nan, np.nan, np.nan], [np.nan, 5, 6]], + [[np.nan, np.nan, np.nan], [np.nan, 3, np.nan]]] - panel_with_missing = Panel(matrix, items=['Item1', 'Item2','Item3'], - major_axis=[1,2], - minor_axis=['A', 'B', 'C']) + panel_with_missing = Panel(matrix, items=['Item1', 'Item2', 'Item3'], + major_axis=[1, 2], + minor_axis=['A', 'B', 'C']) with ensure_clean_path(self.path) as path: - panel_with_missing.to_hdf(path, 'panel_with_missing', format='table') - reloaded_panel = read_hdf(path, 'panel_with_missing') - tm.assert_panel_equal(panel_with_missing, reloaded_panel) + panel_with_missing.to_hdf( + path, 'panel_with_missing', format='table') + reloaded_panel = read_hdf(path, 'panel_with_missing') + tm.assert_panel_equal(panel_with_missing, reloaded_panel) def test_append_frame_column_oriented(self): @@ -1141,46 +1167,48 @@ def test_append_frame_column_oriented(self): # this isn't supported self.assertRaises(TypeError, store.select, 'df1', ( - 'columns=A', Term('index>df.index[4]'))) + 'columns=A', Term('index>df.index[4]'))) def test_append_with_different_block_ordering(self): - #GH 4096; using same frames, but different block orderings + # GH 4096; using same frames, but different block orderings with ensure_clean_store(self.path) as store: for i in range(10): - df = DataFrame(np.random.randn(10,2),columns=list('AB')) + df = DataFrame(np.random.randn(10, 2), columns=list('AB')) df['index'] = range(10) - df['index'] += i*10 - df['int64'] = Series([1]*len(df),dtype='int64') - df['int16'] = Series([1]*len(df),dtype='int16') + df['index'] += i * 10 + df['int64'] = Series([1] * len(df), dtype='int64') + df['int16'] = 
Series([1] * len(df), dtype='int16') if i % 2 == 0: del df['int64'] - df['int64'] = Series([1]*len(df),dtype='int64') + df['int64'] = Series([1] * len(df), dtype='int64') if i % 3 == 0: a = df.pop('A') df['A'] = a - df.set_index('index',inplace=True) + df.set_index('index', inplace=True) - store.append('df',df) + store.append('df', df) - # test a different ordering but with more fields (like invalid combinate) + # test a different ordering but with more fields (like invalid + # combinate) with ensure_clean_store(self.path) as store: - df = DataFrame(np.random.randn(10,2),columns=list('AB'), dtype='float64') - df['int64'] = Series([1]*len(df),dtype='int64') - df['int16'] = Series([1]*len(df),dtype='int16') - store.append('df',df) + df = DataFrame(np.random.randn(10, 2), + columns=list('AB'), dtype='float64') + df['int64'] = Series([1] * len(df), dtype='int64') + df['int16'] = Series([1] * len(df), dtype='int16') + store.append('df', df) # store additonal fields in different blocks - df['int16_2'] = Series([1]*len(df),dtype='int16') + df['int16_2'] = Series([1] * len(df), dtype='int16') self.assertRaises(ValueError, store.append, 'df', df) # store multile additonal fields in different blocks - df['float_3'] = Series([1.]*len(df),dtype='float64') + df['float_3'] = Series([1.] * len(df), dtype='float64') self.assertRaises(ValueError, store.append, 'df', df) def test_ndim_indexables(self): @@ -1208,14 +1236,14 @@ def check_indexers(key, indexers): _maybe_remove(store, 'p4d') store.append('p4d', p4d.ix[:, :, :10, :], axes=indexers) store.append('p4d', p4d.ix[:, :, 10:, :], axes=[ - 'labels', 'items', 'major_axis']) + 'labels', 'items', 'major_axis']) assert_panel4d_equal(store.select('p4d'), p4d) check_indexers('p4d', indexers) # pass incorrect number of axes _maybe_remove(store, 'p4d') self.assertRaises(ValueError, store.append, 'p4d', p4d.ix[ - :, :, :10, :], axes=['major_axis', 'minor_axis']) + :, :, :10, :], axes=['major_axis', 'minor_axis']) # different than default indexables #1 indexers = ['labels', 'major_axis', 'minor_axis'] @@ -1240,14 +1268,14 @@ def check_indexers(key, indexers): # partial selection2 result = store.select('p4d', [Term( - 'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')]) + 'labels=l1'), Term('items=ItemA'), Term('minor_axis=B')]) expected = p4d.reindex( labels=['l1'], items=['ItemA'], minor_axis=['B']) assert_panel4d_equal(result, expected) # non-existant partial selection result = store.select('p4d', [Term( - 'labels=l1'), Term('items=Item1'), Term('minor_axis=B')]) + 'labels=l1'), Term('items=Item1'), Term('minor_axis=B')]) expected = p4d.reindex(labels=['l1'], items=[], minor_axis=['B']) assert_panel4d_equal(result, expected) @@ -1258,8 +1286,9 @@ def test_append_with_strings(self): wp2 = wp.rename_axis( dict([(x, "%s_extra" % x) for x in wp.minor_axis]), axis=2) - def check_col(key,name,size): - self.assertEqual(getattr(store.get_storer(key).table.description,name).itemsize, size) + def check_col(key, name, size): + self.assertEqual(getattr(store.get_storer( + key).table.description, name).itemsize, size) store.append('s1', wp, min_itemsize=20) store.append('s1', wp2) @@ -1324,26 +1353,28 @@ def check_col(key,name,size): with ensure_clean_store(self.path) as store: - def check_col(key,name,size): - self.assertEqual(getattr(store.get_storer(key).table.description,name).itemsize, size) + def check_col(key, name, size): + self.assertEqual(getattr(store.get_storer( + key).table.description, name).itemsize, size) - df = DataFrame(dict(A = 'foo', B = 
'bar'),index=range(10)) + df = DataFrame(dict(A='foo', B='bar'), index=range(10)) # a min_itemsize that creates a data_column _maybe_remove(store, 'df') - store.append('df', df, min_itemsize={'A' : 200 }) + store.append('df', df, min_itemsize={'A': 200}) check_col('df', 'A', 200) self.assertEqual(store.get_storer('df').data_columns, ['A']) # a min_itemsize that creates a data_column2 _maybe_remove(store, 'df') - store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 }) + store.append('df', df, data_columns=['B'], min_itemsize={'A': 200}) check_col('df', 'A', 200) - self.assertEqual(store.get_storer('df').data_columns, ['B','A']) + self.assertEqual(store.get_storer('df').data_columns, ['B', 'A']) # a min_itemsize that creates a data_column2 _maybe_remove(store, 'df') - store.append('df', df, data_columns = ['B'], min_itemsize={'values' : 200 }) + store.append('df', df, data_columns=[ + 'B'], min_itemsize={'values': 200}) check_col('df', 'B', 200) check_col('df', 'values_block_0', 200) self.assertEqual(store.get_storer('df').data_columns, ['B']) @@ -1355,15 +1386,17 @@ def check_col(key,name,size): tm.assert_frame_equal(store['df'], df) # invalid min_itemsize keys - df = DataFrame(['foo','foo','foo','barh','barh','barh'],columns=['A']) + df = DataFrame(['foo', 'foo', 'foo', 'barh', + 'barh', 'barh'], columns=['A']) _maybe_remove(store, 'df') - self.assertRaises(ValueError, store.append, 'df', df, min_itemsize={'foo' : 20, 'foobar' : 20}) + self.assertRaises(ValueError, store.append, 'df', + df, min_itemsize={'foo': 20, 'foobar': 20}) def test_append_with_data_columns(self): with ensure_clean_store(self.path) as store: df = tm.makeTimeDataFrame() - df.loc[:,'B'].iloc[0] = 1. + df.loc[:, 'B'].iloc[0] = 1. _maybe_remove(store, 'df') store.append('df', df[:2], data_columns=['B']) store.append('df', df[2:]) @@ -1388,8 +1421,8 @@ def test_append_with_data_columns(self): # data column selection with a string data_column df_new = df.copy() df_new['string'] = 'foo' - df_new.loc[1:4,'string'] = np.nan - df_new.loc[5:6,'string'] = 'bar' + df_new.loc[1:4, 'string'] = np.nan + df_new.loc[5:6, 'string'] = 'bar' _maybe_remove(store, 'df') store.append('df', df_new, data_columns=['string']) result = store.select('df', [Term('string=foo')]) @@ -1397,8 +1430,9 @@ def test_append_with_data_columns(self): tm.assert_frame_equal(result, expected) # using min_itemsize and a data column - def check_col(key,name,size): - self.assertEqual(getattr(store.get_storer(key).table.description,name).itemsize, size) + def check_col(key, name, size): + self.assertEqual(getattr(store.get_storer( + key).table.description, name).itemsize, size) with ensure_clean_store(self.path) as store: _maybe_remove(store, 'df') @@ -1419,7 +1453,9 @@ def check_col(key,name,size): df_new['string_block1'] = 'foobarbah1' df_new['string_block2'] = 'foobarbah2' _maybe_remove(store, 'df') - store.append('df', df_new, data_columns=['string', 'string2'], min_itemsize={'string': 30, 'string2': 40, 'values': 50}) + store.append('df', df_new, data_columns=['string', 'string2'], + min_itemsize={'string': 30, 'string2': 40, + 'values': 50}) check_col('df', 'string', 30) check_col('df', 'string2', 40) check_col('df', 'values_block_1', 50) @@ -1427,28 +1463,28 @@ def check_col(key,name,size): with ensure_clean_store(self.path) as store: # multiple data columns df_new = df.copy() - df_new.ix[0,'A'] = 1. - df_new.ix[0,'B'] = -1. + df_new.ix[0, 'A'] = 1. + df_new.ix[0, 'B'] = -1. 
df_new['string'] = 'foo' - df_new.loc[1:4,'string'] = np.nan - df_new.loc[5:6,'string'] = 'bar' + df_new.loc[1:4, 'string'] = np.nan + df_new.loc[5:6, 'string'] = 'bar' df_new['string2'] = 'foo' - df_new.loc[2:5,'string2'] = np.nan - df_new.loc[7:8,'string2'] = 'bar' + df_new.loc[2:5, 'string2'] = np.nan + df_new.loc[7:8, 'string2'] = 'bar' _maybe_remove(store, 'df') store.append( 'df', df_new, data_columns=['A', 'B', 'string', 'string2']) result = store.select('df', [Term('string=foo'), Term( - 'string2=foo'), Term('A>0'), Term('B<0')]) + 'string2=foo'), Term('A>0'), Term('B<0')]) expected = df_new[(df_new.string == 'foo') & ( - df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] + df_new.string2 == 'foo') & (df_new.A > 0) & (df_new.B < 0)] tm.assert_frame_equal(result, expected, check_index_type=False) # yield an empty frame result = store.select('df', [Term('string=foo'), Term( - 'string2=cool')]) + 'string2=cool')]) expected = df_new[(df_new.string == 'foo') & ( - df_new.string2 == 'cool')] + df_new.string2 == 'cool')] tm.assert_frame_equal(result, expected, check_index_type=False) with ensure_clean_store(self.path) as store: @@ -1463,8 +1499,9 @@ def check_col(key,name,size): df_dc.ix[3:5, ['A', 'B', 'datetime']] = np.nan _maybe_remove(store, 'df_dc') - store.append('df_dc', df_dc, data_columns=['B', 'C', - 'string', 'string2', 'datetime']) + store.append('df_dc', df_dc, + data_columns=['B', 'C', 'string', + 'string2', 'datetime']) result = store.select('df_dc', [Term('B>0')]) expected = df_dc[df_dc.B > 0] @@ -1473,7 +1510,7 @@ def check_col(key,name,size): result = store.select( 'df_dc', ['B > 0', 'C > 0', 'string == foo']) expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & ( - df_dc.string == 'foo')] + df_dc.string == 'foo')] tm.assert_frame_equal(result, expected, check_index_type=False) with ensure_clean_store(self.path) as store: @@ -1483,21 +1520,24 @@ def check_col(key,name,size): df_dc = DataFrame(np.random.randn(8, 3), index=index, columns=['A', 'B', 'C']) df_dc['string'] = 'foo' - df_dc.ix[4:6,'string'] = np.nan - df_dc.ix[7:9,'string'] = 'bar' - df_dc.ix[:,['B','C']] = df_dc.ix[:,['B','C']].abs() + df_dc.ix[4:6, 'string'] = np.nan + df_dc.ix[7:9, 'string'] = 'bar' + df_dc.ix[:, ['B', 'C']] = df_dc.ix[:, ['B', 'C']].abs() df_dc['string2'] = 'cool' # on-disk operations - store.append('df_dc', df_dc, data_columns = ['B', 'C', 'string', 'string2']) + store.append('df_dc', df_dc, data_columns=[ + 'B', 'C', 'string', 'string2']) - result = store.select('df_dc', [ Term('B>0') ]) - expected = df_dc[df_dc.B>0] - tm.assert_frame_equal(result,expected) + result = store.select('df_dc', [Term('B>0')]) + expected = df_dc[df_dc.B > 0] + tm.assert_frame_equal(result, expected) - result = store.select('df_dc', ['B > 0', 'C > 0', 'string == "foo"']) - expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')] - tm.assert_frame_equal(result,expected) + result = store.select( + 'df_dc', ['B > 0', 'C > 0', 'string == "foo"']) + expected = df_dc[(df_dc.B > 0) & (df_dc.C > 0) & + (df_dc.string == 'foo')] + tm.assert_frame_equal(result, expected) with ensure_clean_store(self.path) as store: # panel @@ -1505,29 +1545,30 @@ def check_col(key,name,size): np.random.seed(1234) p = tm.makePanel() - store.append('p1',p) - tm.assert_panel_equal(store.select('p1'),p) + store.append('p1', p) + tm.assert_panel_equal(store.select('p1'), p) - store.append('p2',p,data_columns=True) - tm.assert_panel_equal(store.select('p2'),p) + store.append('p2', p, data_columns=True) + 
tm.assert_panel_equal(store.select('p2'), p) - result = store.select('p2',where='ItemA>0') + result = store.select('p2', where='ItemA>0') expected = p.to_frame() - expected = expected[expected['ItemA']>0] - tm.assert_frame_equal(result.to_frame(),expected) + expected = expected[expected['ItemA'] > 0] + tm.assert_frame_equal(result.to_frame(), expected) - result = store.select('p2',where='ItemA>0 & minor_axis=["A","B"]') + result = store.select('p2', where='ItemA>0 & minor_axis=["A","B"]') expected = p.to_frame() - expected = expected[expected['ItemA']>0] - expected = expected[expected.reset_index(level=['major']).index.isin(['A','B'])] - tm.assert_frame_equal(result.to_frame(),expected) + expected = expected[expected['ItemA'] > 0] + expected = expected[expected.reset_index( + level=['major']).index.isin(['A', 'B'])] + tm.assert_frame_equal(result.to_frame(), expected) def test_create_table_index(self): with ensure_clean_store(self.path) as store: - def col(t,column): - return getattr(store.get_storer(t).table.cols,column) + def col(t, column): + return getattr(store.get_storer(t).table.cols, column) # index=False wp = tm.makePanel() @@ -1607,15 +1648,15 @@ def test_append_hierarchical(self): tm.assert_frame_equal(result, df) # GH 3748 - result = store.select('mi',columns=['A','B']) - expected = df.reindex(columns=['A','B']) - tm.assert_frame_equal(result,expected) + result = store.select('mi', columns=['A', 'B']) + expected = df.reindex(columns=['A', 'B']) + tm.assert_frame_equal(result, expected) with ensure_clean_path('test.hdf') as path: - df.to_hdf(path,'df',format='table') - result = read_hdf(path,'df',columns=['A','B']) - expected = df.reindex(columns=['A','B']) - tm.assert_frame_equal(result,expected) + df.to_hdf(path, 'df', format='table') + result = read_hdf(path, 'df', columns=['A', 'B']) + expected = df.reindex(columns=['A', 'B']) + tm.assert_frame_equal(result, expected) def test_column_multiindex(self): # GH 4710 @@ -1658,7 +1699,7 @@ def test_column_multiindex(self): columns=Index(list('ABCD'), name='foo')) expected = df.copy() if isinstance(expected.index, RangeIndex): - expected.index = Int64Index(expected.index) + expected.index = Int64Index(expected.index) with ensure_clean_store(self.path) as store: @@ -1674,44 +1715,53 @@ def test_store_multiindex(self): with ensure_clean_store(self.path) as store: def make_index(names=None): - return MultiIndex.from_tuples([( datetime.datetime(2013,12,d), s, t) for d in range(1,3) for s in range(2) for t in range(3)], + return MultiIndex.from_tuples([(datetime.datetime(2013, 12, d), + s, t) + for d in range(1, 3) + for s in range(2) + for t in range(3)], names=names) - # no names _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index()) - store.append('df',df) - tm.assert_frame_equal(store.select('df'),df) + df = DataFrame(np.zeros((12, 2)), columns=[ + 'a', 'b'], index=make_index()) + store.append('df', df) + tm.assert_frame_equal(store.select('df'), df) # partial names _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index(['date',None,None])) - store.append('df',df) - tm.assert_frame_equal(store.select('df'),df) + df = DataFrame(np.zeros((12, 2)), columns=[ + 'a', 'b'], index=make_index(['date', None, None])) + store.append('df', df) + tm.assert_frame_equal(store.select('df'), df) # series _maybe_remove(store, 's') s = Series(np.zeros(12), index=make_index(['date', None, None])) - store.append('s',s) - xp = Series(np.zeros(12), 
index=make_index(['date', 'level_1', 'level_2'])) + store.append('s', s) + xp = Series(np.zeros(12), index=make_index( + ['date', 'level_1', 'level_2'])) tm.assert_series_equal(store.select('s'), xp) # dup with column _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index(['date','a','t'])) - self.assertRaises(ValueError, store.append, 'df',df) + df = DataFrame(np.zeros((12, 2)), columns=[ + 'a', 'b'], index=make_index(['date', 'a', 't'])) + self.assertRaises(ValueError, store.append, 'df', df) # dup within level _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index(['date','date','date'])) - self.assertRaises(ValueError, store.append, 'df',df) + df = DataFrame(np.zeros((12, 2)), columns=['a', 'b'], + index=make_index(['date', 'date', 'date'])) + self.assertRaises(ValueError, store.append, 'df', df) # fully names _maybe_remove(store, 'df') - df = DataFrame(np.zeros((12,2)), columns=['a','b'], index=make_index(['date','s','t'])) - store.append('df',df) - tm.assert_frame_equal(store.select('df'),df) + df = DataFrame(np.zeros((12, 2)), columns=[ + 'a', 'b'], index=make_index(['date', 's', 't'])) + store.append('df', df) + tm.assert_frame_equal(store.select('df'), df) def test_select_columns_in_where(self): @@ -1734,23 +1784,25 @@ def test_select_columns_in_where(self): tm.assert_frame_equal(store.select('df', columns=['A']), expected) - tm.assert_frame_equal(store.select('df', where="columns=['A']"), expected) + tm.assert_frame_equal(store.select( + 'df', where="columns=['A']"), expected) # With a Series s = Series(np.random.randn(10), index=index, name='A') with ensure_clean_store(self.path) as store: store.put('s', s, format='table') - tm.assert_series_equal(store.select('s', where="columns=['A']"),s) + tm.assert_series_equal(store.select('s', where="columns=['A']"), s) def test_pass_spec_to_storer(self): df = tm.makeDataFrame() with ensure_clean_store(self.path) as store: - store.put('df',df) + store.put('df', df) self.assertRaises(TypeError, store.select, 'df', columns=['A']) - self.assertRaises(TypeError, store.select, 'df',where=[('columns=A')]) + self.assertRaises(TypeError, store.select, + 'df', where=[('columns=A')]) def test_append_misc(self): @@ -1758,13 +1810,13 @@ def test_append_misc(self): # unsuported data types for non-tables p4d = tm.makePanel4D() - self.assertRaises(TypeError, store.put,'p4d',p4d) + self.assertRaises(TypeError, store.put, 'p4d', p4d) # unsuported data types - self.assertRaises(TypeError, store.put,'abc',None) - self.assertRaises(TypeError, store.put,'abc','123') - self.assertRaises(TypeError, store.put,'abc',123) - self.assertRaises(TypeError, store.put,'abc',np.arange(5)) + self.assertRaises(TypeError, store.put, 'abc', None) + self.assertRaises(TypeError, store.put, 'abc', '123') + self.assertRaises(TypeError, store.put, 'abc', 123) + self.assertRaises(TypeError, store.put, 'abc', np.arange(5)) df = tm.makeDataFrame() store.append('df', df, chunksize=1) @@ -1778,18 +1830,18 @@ def test_append_misc(self): # more chunksize in append tests def check(obj, comparator): for c in [10, 200, 1000]: - with ensure_clean_store(self.path,mode='w') as store: + with ensure_clean_store(self.path, mode='w') as store: store.append('obj', obj, chunksize=c) result = store.select('obj') - comparator(result,obj) + comparator(result, obj) df = tm.makeDataFrame() df['string'] = 'foo' df['float322'] = 1. 
df['float322'] = df['float322'].astype('float32') - df['bool'] = df['float322'] > 0 - df['time1'] = Timestamp('20130101') - df['time2'] = Timestamp('20130102') + df['bool'] = df['float322'] > 0 + df['time1'] = Timestamp('20130101') + df['time2'] = Timestamp('20130102') check(df, tm.assert_frame_equal) p = tm.makePanel() @@ -1803,36 +1855,36 @@ def check(obj, comparator): # 0 len df_empty = DataFrame(columns=list('ABC')) - store.append('df',df_empty) - self.assertRaises(KeyError,store.select, 'df') + store.append('df', df_empty) + self.assertRaises(KeyError, store.select, 'df') # repeated append of 0/non-zero frames - df = DataFrame(np.random.rand(10,3),columns=list('ABC')) - store.append('df',df) - assert_frame_equal(store.select('df'),df) - store.append('df',df_empty) - assert_frame_equal(store.select('df'),df) + df = DataFrame(np.random.rand(10, 3), columns=list('ABC')) + store.append('df', df) + assert_frame_equal(store.select('df'), df) + store.append('df', df_empty) + assert_frame_equal(store.select('df'), df) # store df = DataFrame(columns=list('ABC')) - store.put('df2',df) - assert_frame_equal(store.select('df2'),df) + store.put('df2', df) + assert_frame_equal(store.select('df2'), df) # 0 len p_empty = Panel(items=list('ABC')) - store.append('p',p_empty) - self.assertRaises(KeyError,store.select, 'p') + store.append('p', p_empty) + self.assertRaises(KeyError, store.select, 'p') # repeated append of 0/non-zero frames - p = Panel(np.random.randn(3,4,5),items=list('ABC')) - store.append('p',p) - assert_panel_equal(store.select('p'),p) - store.append('p',p_empty) - assert_panel_equal(store.select('p'),p) + p = Panel(np.random.randn(3, 4, 5), items=list('ABC')) + store.append('p', p) + assert_panel_equal(store.select('p'), p) + store.append('p', p_empty) + assert_panel_equal(store.select('p'), p) # store - store.put('p2',p_empty) - assert_panel_equal(store.select('p2'),p_empty) + store.put('p2', p_empty) + assert_panel_equal(store.select('p2'), p_empty) def test_append_raise(self): @@ -1844,34 +1896,35 @@ def test_append_raise(self): df = tm.makeDataFrame() df['invalid'] = [['a']] * len(df) self.assertEqual(df.dtypes['invalid'], np.object_) - self.assertRaises(TypeError, store.append,'df',df) + self.assertRaises(TypeError, store.append, 'df', df) # multiple invalid columns df['invalid2'] = [['a']] * len(df) df['invalid3'] = [['a']] * len(df) - self.assertRaises(TypeError, store.append,'df',df) + self.assertRaises(TypeError, store.append, 'df', df) # datetime with embedded nans as object df = tm.makeDataFrame() - s = Series(datetime.datetime(2001,1,2),index=df.index) + s = Series(datetime.datetime(2001, 1, 2), index=df.index) s = s.astype(object) s[0:5] = np.nan df['invalid'] = s self.assertEqual(df.dtypes['invalid'], np.object_) - self.assertRaises(TypeError, store.append,'df', df) + self.assertRaises(TypeError, store.append, 'df', df) # directy ndarray - self.assertRaises(TypeError, store.append,'df',np.arange(10)) + self.assertRaises(TypeError, store.append, 'df', np.arange(10)) # series directly - self.assertRaises(TypeError, store.append,'df',Series(np.arange(10))) + self.assertRaises(TypeError, store.append, + 'df', Series(np.arange(10))) # appending an incompatbile table df = tm.makeDataFrame() - store.append('df',df) + store.append('df', df) df['foo'] = 'foo' - self.assertRaises(ValueError, store.append,'df',df) + self.assertRaises(ValueError, store.append, 'df', df) def test_table_index_incompatible_dtypes(self): df1 = DataFrame({'a': [1, 2, 3]}) @@ -1888,39 +1941,42 @@ def 
test_table_values_dtypes_roundtrip(self): with ensure_clean_store(self.path) as store: df1 = DataFrame({'a': [1, 2, 3]}, dtype='f8') store.append('df_f8', df1) - assert_series_equal(df1.dtypes,store['df_f8'].dtypes) + assert_series_equal(df1.dtypes, store['df_f8'].dtypes) df2 = DataFrame({'a': [1, 2, 3]}, dtype='i8') store.append('df_i8', df2) - assert_series_equal(df2.dtypes,store['df_i8'].dtypes) + assert_series_equal(df2.dtypes, store['df_i8'].dtypes) # incompatible dtype self.assertRaises(ValueError, store.append, 'df_i8', df1) - # check creation/storage/retrieval of float32 (a bit hacky to actually create them thought) - df1 = DataFrame(np.array([[1],[2],[3]],dtype='f4'),columns = ['A']) + # check creation/storage/retrieval of float32 (a bit hacky to + # actually create them thought) + df1 = DataFrame( + np.array([[1], [2], [3]], dtype='f4'), columns=['A']) store.append('df_f4', df1) - assert_series_equal(df1.dtypes,store['df_f4'].dtypes) + assert_series_equal(df1.dtypes, store['df_f4'].dtypes) assert df1.dtypes[0] == 'float32' # check with mixed dtypes - df1 = DataFrame(dict([ (c,Series(np.random.randn(5),dtype=c)) for c in - ['float32','float64','int32','int64','int16','int8'] ])) + df1 = DataFrame(dict([(c, Series(np.random.randn(5), dtype=c)) + for c in ['float32', 'float64', 'int32', + 'int64', 'int16', 'int8']])) df1['string'] = 'foo' df1['float322'] = 1. df1['float322'] = df1['float322'].astype('float32') - df1['bool'] = df1['float32'] > 0 - df1['time1'] = Timestamp('20130101') - df1['time2'] = Timestamp('20130102') + df1['bool'] = df1['float32'] > 0 + df1['time1'] = Timestamp('20130101') + df1['time2'] = Timestamp('20130102') store.append('df_mixed_dtypes1', df1) result = store.select('df_mixed_dtypes1').get_dtype_counts() - expected = Series({ 'float32' : 2, 'float64' : 1,'int32' : 1, 'bool' : 1, - 'int16' : 1, 'int8' : 1, 'int64' : 1, 'object' : 1, - 'datetime64[ns]' : 2}) + expected = Series({'float32': 2, 'float64': 1, 'int32': 1, + 'bool': 1, 'int16': 1, 'int8': 1, + 'int64': 1, 'object': 1, 'datetime64[ns]': 2}) result.sort() expected.sort() - tm.assert_series_equal(result,expected) + tm.assert_series_equal(result, expected) def test_table_mixed_dtypes(self): @@ -1982,7 +2038,7 @@ def test_unimplemented_dtypes_table_columns(self): if not compat.PY3: l.append(('unicode', u('\\u03c3'))) - ### currently not supported dtypes #### + # currently not supported dtypes #### for n, f in l: df = tm.makeDataFrame() df[n] = f @@ -2005,20 +2061,23 @@ def test_calendar_roundtrip_issue(self): # 8591 # doc example from tseries holiday section weekmask_egypt = 'Sun Mon Tue Wed Thu' - holidays = ['2012-05-01', datetime.datetime(2013, 5, 1), np.datetime64('2014-05-01')] - bday_egypt = pandas.offsets.CustomBusinessDay(holidays=holidays, weekmask=weekmask_egypt) + holidays = ['2012-05-01', + datetime.datetime(2013, 5, 1), np.datetime64('2014-05-01')] + bday_egypt = pandas.offsets.CustomBusinessDay( + holidays=holidays, weekmask=weekmask_egypt) dt = datetime.datetime(2013, 4, 30) dts = date_range(dt, periods=5, freq=bday_egypt) - s = (Series(dts.weekday, dts).map(Series('Mon Tue Wed Thu Fri Sat Sun'.split()))) + s = (Series(dts.weekday, dts).map( + Series('Mon Tue Wed Thu Fri Sat Sun'.split()))) with ensure_clean_store(self.path) as store: - store.put('fixed',s) + store.put('fixed', s) result = store.select('fixed') assert_series_equal(result, s) - store.append('table',s) + store.append('table', s) result = store.select('table') assert_series_equal(result, s) @@ -2027,42 +2086,43 @@ def 
test_append_with_timedelta(self): # append timedelta from datetime import timedelta - df = DataFrame(dict(A = Timestamp('20130101'), B = [ Timestamp('20130101') + timedelta(days=i,seconds=10) for i in range(10) ])) - df['C'] = df['A']-df['B'] - df.ix[3:5,'C'] = np.nan + df = DataFrame(dict(A=Timestamp('20130101'), B=[Timestamp( + '20130101') + timedelta(days=i, seconds=10) for i in range(10)])) + df['C'] = df['A'] - df['B'] + df.ix[3:5, 'C'] = np.nan with ensure_clean_store(self.path) as store: # table _maybe_remove(store, 'df') - store.append('df',df,data_columns=True) + store.append('df', df, data_columns=True) result = store.select('df') - assert_frame_equal(result,df) + assert_frame_equal(result, df) - result = store.select('df',Term("C<100000")) - assert_frame_equal(result,df) + result = store.select('df', Term("C<100000")) + assert_frame_equal(result, df) - result = store.select('df',Term("C","<",-3*86400)) - assert_frame_equal(result,df.iloc[3:]) + result = store.select('df', Term("C", "<", -3 * 86400)) + assert_frame_equal(result, df.iloc[3:]) - result = store.select('df',"C<'-3D'") - assert_frame_equal(result,df.iloc[3:]) + result = store.select('df', "C<'-3D'") + assert_frame_equal(result, df.iloc[3:]) # a bit hacky here as we don't really deal with the NaT properly - result = store.select('df',"C<'-500000s'") + result = store.select('df', "C<'-500000s'") result = result.dropna(subset=['C']) - assert_frame_equal(result,df.iloc[6:]) + assert_frame_equal(result, df.iloc[6:]) - result = store.select('df',"C<'-3.5D'") + result = store.select('df', "C<'-3.5D'") result = result.iloc[1:] - assert_frame_equal(result,df.iloc[4:]) + assert_frame_equal(result, df.iloc[4:]) # fixed _maybe_remove(store, 'df2') - store.put('df2',df) + store.put('df2', df) result = store.select('df2') - assert_frame_equal(result,df) + assert_frame_equal(result, df) def test_remove(self): @@ -2148,9 +2208,9 @@ def test_remove_startstop(self): _maybe_remove(store, 'wp1') store.put('wp1', wp, format='t') n = store.remove('wp1', start=32) - self.assertTrue(n == 120-32) + self.assertTrue(n == 120 - 32) result = store.select('wp1') - expected = wp.reindex(major_axis=wp.major_axis[:32//4]) + expected = wp.reindex(major_axis=wp.major_axis[:32 // 4]) assert_panel_equal(result, expected) _maybe_remove(store, 'wp2') @@ -2158,7 +2218,7 @@ def test_remove_startstop(self): n = store.remove('wp2', start=-32) self.assertTrue(n == 32) result = store.select('wp2') - expected = wp.reindex(major_axis=wp.major_axis[:-32//4]) + expected = wp.reindex(major_axis=wp.major_axis[:-32 // 4]) assert_panel_equal(result, expected) # stop @@ -2167,24 +2227,25 @@ def test_remove_startstop(self): n = store.remove('wp3', stop=32) self.assertTrue(n == 32) result = store.select('wp3') - expected = wp.reindex(major_axis=wp.major_axis[32//4:]) + expected = wp.reindex(major_axis=wp.major_axis[32 // 4:]) assert_panel_equal(result, expected) _maybe_remove(store, 'wp4') store.put('wp4', wp, format='t') n = store.remove('wp4', stop=-32) - self.assertTrue(n == 120-32) + self.assertTrue(n == 120 - 32) result = store.select('wp4') - expected = wp.reindex(major_axis=wp.major_axis[-32//4:]) + expected = wp.reindex(major_axis=wp.major_axis[-32 // 4:]) assert_panel_equal(result, expected) # start n stop _maybe_remove(store, 'wp5') store.put('wp5', wp, format='t') n = store.remove('wp5', start=16, stop=-16) - self.assertTrue(n == 120-32) + self.assertTrue(n == 120 - 32) result = store.select('wp5') - expected = 
wp.reindex(major_axis=wp.major_axis[:16//4].union(wp.major_axis[-16//4:])) + expected = wp.reindex(major_axis=wp.major_axis[ + :16 // 4].union(wp.major_axis[-16 // 4:])) assert_panel_equal(result, expected) _maybe_remove(store, 'wp6') @@ -2197,13 +2258,17 @@ def test_remove_startstop(self): # with where _maybe_remove(store, 'wp7') - date = wp.major_axis.take(np.arange(0,30,3)) + + # TODO: unused? + date = wp.major_axis.take(np.arange(0, 30, 3)) # noqa + crit = Term('major_axis=date') store.put('wp7', wp, format='t') n = store.remove('wp7', where=[crit], stop=80) self.assertTrue(n == 28) result = store.select('wp7') - expected = wp.reindex(major_axis=wp.major_axis.difference(wp.major_axis[np.arange(0,20,3)])) + expected = wp.reindex(major_axis=wp.major_axis.difference( + wp.major_axis[np.arange(0, 20, 3)])) assert_panel_equal(result, expected) def test_remove_crit(self): @@ -2256,16 +2321,18 @@ def test_remove_crit(self): crit2 = Term('major_axis=date2') store.remove('wp2', where=[crit2]) result = store['wp2'] - expected = wp.reindex( - major_axis=wp.major_axis.difference(date1).difference(Index([date2]))) + expected = wp.reindex(major_axis=wp.major_axis.difference(date1) + .difference(Index([date2]))) assert_panel_equal(result, expected) date3 = [wp.major_axis[7], wp.major_axis[9]] crit3 = Term('major_axis=date3') store.remove('wp2', where=[crit3]) result = store['wp2'] - expected = wp.reindex( - major_axis=wp.major_axis.difference(date1).difference(Index([date2])).difference(Index(date3))) + expected = wp.reindex(major_axis=wp.major_axis + .difference(date1) + .difference(Index([date2])) + .difference(Index(date3))) assert_panel_equal(result, expected) # corners @@ -2282,7 +2349,7 @@ def test_invalid_terms(self): df = tm.makeTimeDataFrame() df['string'] = 'foo' - df.ix[0:4,'string'] = 'bar' + df.ix[0:4, 'string'] = 'bar' wp = tm.makePanel() p4d = tm.makePanel4D() store.put('df', df, format='table') @@ -2290,31 +2357,39 @@ def test_invalid_terms(self): store.put('p4d', p4d, format='table') # some invalid terms - self.assertRaises(ValueError, store.select, 'wp', "minor=['A', 'B']") - self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114']"]) - self.assertRaises(ValueError, store.select, 'wp', ["index=['20121114', '20121114']"]) + self.assertRaises(ValueError, store.select, + 'wp', "minor=['A', 'B']") + self.assertRaises(ValueError, store.select, + 'wp', ["index=['20121114']"]) + self.assertRaises(ValueError, store.select, 'wp', [ + "index=['20121114', '20121114']"]) self.assertRaises(TypeError, Term) # more invalid - self.assertRaises(ValueError, store.select, 'df','df.index[3]') - self.assertRaises(SyntaxError, store.select, 'df','index>') - self.assertRaises(ValueError, store.select, 'wp', "major_axis<'20000108' & minor_axis['A', 'B']") + self.assertRaises(ValueError, store.select, 'df', 'df.index[3]') + self.assertRaises(SyntaxError, store.select, 'df', 'index>') + self.assertRaises(ValueError, store.select, 'wp', + "major_axis<'20000108' & minor_axis['A', 'B']") # from the docs with ensure_clean_path(self.path) as path: - dfq = DataFrame(np.random.randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) - dfq.to_hdf(path,'dfq',format='table',data_columns=True) + dfq = DataFrame(np.random.randn(10, 4), columns=list( + 'ABCD'), index=date_range('20130101', periods=10)) + dfq.to_hdf(path, 'dfq', format='table', data_columns=True) # check ok - read_hdf(path,'dfq',where="index>Timestamp('20130104') & columns=['A', 'B']") - read_hdf(path,'dfq',where="A>0 or 
C>0") + read_hdf(path, 'dfq', + where="index>Timestamp('20130104') & columns=['A', 'B']") + read_hdf(path, 'dfq', where="A>0 or C>0") # catch the invalid reference with ensure_clean_path(self.path) as path: - dfq = DataFrame(np.random.randn(10,4),columns=list('ABCD'),index=date_range('20130101',periods=10)) - dfq.to_hdf(path,'dfq',format='table') + dfq = DataFrame(np.random.randn(10, 4), columns=list( + 'ABCD'), index=date_range('20130101', periods=10)) + dfq.to_hdf(path, 'dfq', format='table') - self.assertRaises(ValueError, read_hdf, path,'dfq',where="A>0 or C>0") + self.assertRaises(ValueError, read_hdf, path, + 'dfq', where="A>0 or C>0") def test_terms(self): @@ -2322,7 +2397,8 @@ def test_terms(self): wp = tm.makePanel() p4d = tm.makePanel4D() - wpneg = Panel.fromDict({-1: tm.makeDataFrame(), 0: tm.makeDataFrame(), + wpneg = Panel.fromDict({-1: tm.makeDataFrame(), + 0: tm.makeDataFrame(), 1: tm.makeDataFrame()}) store.put('wp', wp, format='table') store.put('p4d', p4d, format='table') @@ -2330,13 +2406,13 @@ def test_terms(self): # panel result = store.select('wp', [Term( - 'major_axis<"20000108"'), Term("minor_axis=['A', 'B']")]) + 'major_axis<"20000108"'), Term("minor_axis=['A', 'B']")]) expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) assert_panel_equal(result, expected) # with deprecation result = store.select('wp', [Term( - 'major_axis','<',"20000108"), Term("minor_axis=['A', 'B']")]) + 'major_axis', '<', "20000108"), Term("minor_axis=['A', 'B']")]) expected = wp.truncate(after='20000108').reindex(minor=['A', 'B']) tm.assert_panel_equal(result, expected) @@ -2372,7 +2448,7 @@ def test_terms(self): ((("minor_axis==['A', 'B']"),),), (("items=['ItemA', 'ItemB']"),), ('items=ItemA'), - ] + ] for t in terms: store.select('wp', t) @@ -2382,13 +2458,15 @@ def test_terms(self): terms = [ (("labels=['l1', 'l2']"),), Term("labels=['l1', 'l2']"), - ] + ] for t in terms: store.select('p4d', t) - with tm.assertRaisesRegexp(TypeError, 'Only named functions are supported'): - store.select('wp', Term('major_axis == (lambda x: x)("20130101")')) + with tm.assertRaisesRegexp(TypeError, + 'Only named functions are supported'): + store.select('wp', Term( + 'major_axis == (lambda x: x)("20130101")')) # check USub node parsing res = store.select('wpneg', Term('items == -1')) @@ -2405,16 +2483,17 @@ def test_term_compat(self): wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], major_axis=date_range('1/1/2000', periods=5), minor_axis=['A', 'B', 'C', 'D']) - store.append('wp',wp) + store.append('wp', wp) result = store.select('wp', [Term('major_axis>20000102'), - Term('minor_axis', '=', ['A','B']) ]) - expected = wp.loc[:,wp.major_axis>Timestamp('20000102'),['A','B']] + Term('minor_axis', '=', ['A', 'B'])]) + expected = wp.loc[:, wp.major_axis > + Timestamp('20000102'), ['A', 'B']] assert_panel_equal(result, expected) store.remove('wp', Term('major_axis>20000103')) result = store.select('wp') - expected = wp.loc[:,wp.major_axis<=Timestamp('20000103'),:] + expected = wp.loc[:, wp.major_axis <= Timestamp('20000103'), :] assert_panel_equal(result, expected) with ensure_clean_store(self.path) as store: @@ -2422,23 +2501,30 @@ def test_term_compat(self): wp = Panel(np.random.randn(2, 5, 4), items=['Item1', 'Item2'], major_axis=date_range('1/1/2000', periods=5), minor_axis=['A', 'B', 'C', 'D']) - store.append('wp',wp) + store.append('wp', wp) # stringified datetimes - result = store.select('wp', [Term('major_axis','>',datetime.datetime(2000,1,2))]) - expected = 
wp.loc[:,wp.major_axis>Timestamp('20000102')] + result = store.select( + 'wp', [Term('major_axis', '>', datetime.datetime(2000, 1, 2))]) + expected = wp.loc[:, wp.major_axis > Timestamp('20000102')] assert_panel_equal(result, expected) - result = store.select('wp', [Term('major_axis','>',datetime.datetime(2000,1,2,0,0))]) - expected = wp.loc[:,wp.major_axis>Timestamp('20000102')] + result = store.select( + 'wp', [Term('major_axis', '>', + datetime.datetime(2000, 1, 2, 0, 0))]) + expected = wp.loc[:, wp.major_axis > Timestamp('20000102')] assert_panel_equal(result, expected) - result = store.select('wp', [Term('major_axis','=',[datetime.datetime(2000,1,2,0,0),datetime.datetime(2000,1,3,0,0)])]) - expected = wp.loc[:,[Timestamp('20000102'),Timestamp('20000103')]] + result = store.select( + 'wp', [Term('major_axis', '=', + [datetime.datetime(2000, 1, 2, 0, 0), + datetime.datetime(2000, 1, 3, 0, 0)])]) + expected = wp.loc[:, [Timestamp('20000102'), + Timestamp('20000103')]] assert_panel_equal(result, expected) - result = store.select('wp', [Term('minor_axis','=',['A','B'])]) - expected = wp.loc[:,:,['A','B']] + result = store.select('wp', [Term('minor_axis', '=', ['A', 'B'])]) + expected = wp.loc[:, :, ['A', 'B']] assert_panel_equal(result, expected) def test_backwards_compat_without_term_object(self): @@ -2459,7 +2545,7 @@ def test_backwards_compat_without_term_object(self): store.remove('wp', ('major_axis>20000103')) result = store.select('wp') - expected = wp.loc[:,wp.major_axis<=Timestamp('20000103'),:] + expected = wp.loc[:, wp.major_axis <= Timestamp('20000103'), :] assert_panel_equal(result, expected) with ensure_clean_store(self.path) as store: @@ -2503,22 +2589,23 @@ def test_same_name_scoping(self): with ensure_clean_store(self.path) as store: import pandas as pd - df = DataFrame(np.random.randn(20, 2),index=pd.date_range('20130101',periods=20)) + df = DataFrame(np.random.randn(20, 2), + index=pd.date_range('20130101', periods=20)) store.put('df', df, format='table') - expected = df[df.index>pd.Timestamp('20130105')] + expected = df[df.index > pd.Timestamp('20130105')] - import datetime - result = store.select('df','index>datetime.datetime(2013,1,5)') - assert_frame_equal(result,expected) + import datetime # noqa + result = store.select('df', 'index>datetime.datetime(2013,1,5)') + assert_frame_equal(result, expected) - from datetime import datetime + from datetime import datetime # noqa # technically an error, but allow it - result = store.select('df','index>datetime.datetime(2013,1,5)') - assert_frame_equal(result,expected) + result = store.select('df', 'index>datetime.datetime(2013,1,5)') + assert_frame_equal(result, expected) - result = store.select('df','index>datetime(2013,1,5)') - assert_frame_equal(result,expected) + result = store.select('df', 'index>datetime(2013,1,5)') + assert_frame_equal(result, expected) def test_series(self): @@ -2533,7 +2620,8 @@ def test_series(self): ts3 = Series(ts.values, Index(np.asarray(ts.index, dtype=object), dtype=object)) - self._check_roundtrip(ts3, tm.assert_series_equal, check_index_type=False) + self._check_roundtrip(ts3, tm.assert_series_equal, + check_index_type=False) def test_sparse_series(self): @@ -2602,7 +2690,8 @@ def test_tuple_index(self): DF = DataFrame(data, index=idx, columns=col) expected_warning = Warning if PY35 else PerformanceWarning - with tm.assert_produces_warning(expected_warning=expected_warning, check_stacklevel=False): + with tm.assert_produces_warning(expected_warning=expected_warning, + check_stacklevel=False): 
self._check_roundtrip(DF, tm.assert_frame_equal) def test_index_types(self): @@ -2616,23 +2705,28 @@ def test_index_types(self): # nose has a deprecation warning in 3.5 expected_warning = Warning if PY35 else PerformanceWarning - with tm.assert_produces_warning(expected_warning=expected_warning, check_stacklevel=False): + with tm.assert_produces_warning(expected_warning=expected_warning, + check_stacklevel=False): ser = Series(values, [0, 'y']) self._check_roundtrip(ser, func) - with tm.assert_produces_warning(expected_warning=expected_warning, check_stacklevel=False): + with tm.assert_produces_warning(expected_warning=expected_warning, + check_stacklevel=False): ser = Series(values, [datetime.datetime.today(), 0]) self._check_roundtrip(ser, func) - with tm.assert_produces_warning(expected_warning=expected_warning, check_stacklevel=False): + with tm.assert_produces_warning(expected_warning=expected_warning, + check_stacklevel=False): ser = Series(values, ['y', 0]) self._check_roundtrip(ser, func) - with tm.assert_produces_warning(expected_warning=expected_warning, check_stacklevel=False): + with tm.assert_produces_warning(expected_warning=expected_warning, + check_stacklevel=False): ser = Series(values, [datetime.date.today(), 'a']) self._check_roundtrip(ser, func) - with tm.assert_produces_warning(expected_warning=expected_warning, check_stacklevel=False): + with tm.assert_produces_warning(expected_warning=expected_warning, + check_stacklevel=False): ser = Series(values, [1.23, 'b']) self._check_roundtrip(ser, func) @@ -2806,58 +2900,61 @@ def test_wide_table(self): def test_select_with_dups(self): # single dtypes - df = DataFrame(np.random.randn(10,4),columns=['A','A','B','B']) - df.index = date_range('20130101 9:30',periods=10,freq='T') + df = DataFrame(np.random.randn(10, 4), columns=['A', 'A', 'B', 'B']) + df.index = date_range('20130101 9:30', periods=10, freq='T') with ensure_clean_store(self.path) as store: - store.append('df',df) + store.append('df', df) result = store.select('df') expected = df - assert_frame_equal(result,expected,by_blocks=True) + assert_frame_equal(result, expected, by_blocks=True) - result = store.select('df',columns=df.columns) + result = store.select('df', columns=df.columns) expected = df - assert_frame_equal(result,expected,by_blocks=True) + assert_frame_equal(result, expected, by_blocks=True) - result = store.select('df',columns=['A']) - expected = df.loc[:,['A']] - assert_frame_equal(result,expected) + result = store.select('df', columns=['A']) + expected = df.loc[:, ['A']] + assert_frame_equal(result, expected) # dups accross dtypes - df = concat([DataFrame(np.random.randn(10,4),columns=['A','A','B','B']), - DataFrame(np.random.randint(0,10,size=20).reshape(10,2),columns=['A','C'])], + df = concat([DataFrame(np.random.randn(10, 4), + columns=['A', 'A', 'B', 'B']), + DataFrame(np.random.randint(0, 10, size=20) + .reshape(10, 2), + columns=['A', 'C'])], axis=1) - df.index = date_range('20130101 9:30',periods=10,freq='T') + df.index = date_range('20130101 9:30', periods=10, freq='T') with ensure_clean_store(self.path) as store: - store.append('df',df) + store.append('df', df) result = store.select('df') expected = df - assert_frame_equal(result,expected,by_blocks=True) + assert_frame_equal(result, expected, by_blocks=True) - result = store.select('df',columns=df.columns) + result = store.select('df', columns=df.columns) expected = df - assert_frame_equal(result,expected,by_blocks=True) + assert_frame_equal(result, expected, by_blocks=True) - expected = 
df.loc[:,['A']] - result = store.select('df',columns=['A']) - assert_frame_equal(result,expected,by_blocks=True) + expected = df.loc[:, ['A']] + result = store.select('df', columns=['A']) + assert_frame_equal(result, expected, by_blocks=True) - expected = df.loc[:,['B','A']] - result = store.select('df',columns=['B','A']) - assert_frame_equal(result,expected,by_blocks=True) + expected = df.loc[:, ['B', 'A']] + result = store.select('df', columns=['B', 'A']) + assert_frame_equal(result, expected, by_blocks=True) # duplicates on both index and columns with ensure_clean_store(self.path) as store: - store.append('df',df) - store.append('df',df) + store.append('df', df) + store.append('df', df) - expected = df.loc[:,['B','A']] + expected = df.loc[:, ['B', 'A']] expected = concat([expected, expected]) - result = store.select('df',columns=['B','A']) - assert_frame_equal(result,expected,by_blocks=True) + result = store.select('df', columns=['B', 'A']) + assert_frame_equal(result, expected, by_blocks=True) def test_wide_table_dups(self): wp = tm.makePanel() @@ -2897,16 +2994,17 @@ def test_sparse_with_compression(self): # GH 2931 # make sparse dataframe - df = DataFrame(np.random.binomial(n=1, p=.01, size=(1e3, 10))).to_sparse(fill_value=0) + df = DataFrame(np.random.binomial( + n=1, p=.01, size=(1e3, 10))).to_sparse(fill_value=0) # case 1: store uncompressed self._check_double_roundtrip(df, tm.assert_frame_equal, - compression = False, + compression=False, check_frame_type=True) # case 2: store compressed (works) self._check_double_roundtrip(df, tm.assert_frame_equal, - compression = 'zlib', + compression='zlib', check_frame_type=True) # set one series to be completely sparse @@ -2914,12 +3012,13 @@ def test_sparse_with_compression(self): # case 3: store df with completely sparse series uncompressed self._check_double_roundtrip(df, tm.assert_frame_equal, - compression = False, + compression=False, check_frame_type=True) - # case 4: try storing df with completely sparse series compressed (fails) + # case 4: try storing df with completely sparse series compressed + # (fails) self._check_double_roundtrip(df, tm.assert_frame_equal, - compression = 'zlib', + compression='zlib', check_frame_type=True) def test_select(self): @@ -2938,9 +3037,10 @@ def test_select(self): store.select('wp2') # selection on the non-indexable with a large number of columns - wp = Panel( - np.random.randn(100, 100, 100), items=['Item%03d' % i for i in range(100)], - major_axis=date_range('1/1/2000', periods=100), minor_axis=['E%03d' % i for i in range(100)]) + wp = Panel(np.random.randn(100, 100, 100), + items=['Item%03d' % i for i in range(100)], + major_axis=date_range('1/1/2000', periods=100), + minor_axis=['E%03d' % i for i in range(100)]) _maybe_remove(store, 'wp') store.append('wp', wp) @@ -2990,9 +3090,10 @@ def test_select(self): def test_select_dtypes(self): with ensure_clean_store(self.path) as store: - # with a Timestamp data column (GH #2637) - df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), A=np.random.randn(300))) + df = DataFrame(dict( + ts=bdate_range('2012-01-01', periods=300), + A=np.random.randn(300))) _maybe_remove(store, 'df') store.append('df', df, data_columns=['ts', 'A']) @@ -3001,21 +3102,25 @@ def test_select_dtypes(self): tm.assert_frame_equal(expected, result) # bool columns (GH #2849) - df = DataFrame(np.random.randn(5,2), columns =['A','B']) + df = DataFrame(np.random.randn(5, 2), columns=['A', 'B']) df['object'] = 'foo' - df.ix[4:5,'object'] = 'bar' + df.ix[4:5, 'object'] = 'bar' 
df['boolv'] = df['A'] > 0 _maybe_remove(store, 'df') - store.append('df', df, data_columns = True) + store.append('df', df, data_columns=True) - expected = df[df.boolv == True].reindex(columns=['A','boolv']) - for v in [True,'true',1]: - result = store.select('df', Term('boolv == %s' % str(v)), columns = ['A','boolv']) + expected = (df[df.boolv == True] # noqa + .reindex(columns=['A', 'boolv'])) + for v in [True, 'true', 1]: + result = store.select('df', Term( + 'boolv == %s' % str(v)), columns=['A', 'boolv']) tm.assert_frame_equal(expected, result) - expected = df[df.boolv == False ].reindex(columns=['A','boolv']) - for v in [False,'false',0]: - result = store.select('df', Term('boolv == %s' % str(v)), columns = ['A','boolv']) + expected = (df[df.boolv == False] # noqa + .reindex(columns=['A', 'boolv'])) + for v in [False, 'false', 0]: + result = store.select('df', Term( + 'boolv == %s' % str(v)), columns=['A', 'boolv']) tm.assert_frame_equal(expected, result) # integer index @@ -3024,55 +3129,57 @@ def test_select_dtypes(self): store.append('df_int', df) result = store.select( 'df_int', [Term("index<10"), Term("columns=['A']")]) - expected = df.reindex(index=list(df.index)[0:10],columns=['A']) + expected = df.reindex(index=list(df.index)[0:10], columns=['A']) tm.assert_frame_equal(expected, result) # float index df = DataFrame(dict(A=np.random.rand( - 20), B=np.random.rand(20), index=np.arange(20, dtype='f8'))) + 20), B=np.random.rand(20), index=np.arange(20, dtype='f8'))) _maybe_remove(store, 'df_float') store.append('df_float', df) result = store.select( 'df_float', [Term("index<10.0"), Term("columns=['A']")]) - expected = df.reindex(index=list(df.index)[0:10],columns=['A']) + expected = df.reindex(index=list(df.index)[0:10], columns=['A']) tm.assert_frame_equal(expected, result) with ensure_clean_store(self.path) as store: # floats w/o NaN - df = DataFrame(dict(cols = range(11), values = range(11)),dtype='float64') - df['cols'] = (df['cols']+10).apply(str) + df = DataFrame( + dict(cols=range(11), values=range(11)), dtype='float64') + df['cols'] = (df['cols'] + 10).apply(str) - store.append('df1',df,data_columns=True) + store.append('df1', df, data_columns=True) result = store.select( 'df1', where='values>2.0') - expected = df[df['values']>2.0] + expected = df[df['values'] > 2.0] tm.assert_frame_equal(expected, result) # floats with NaN df.iloc[0] = np.nan - expected = df[df['values']>2.0] + expected = df[df['values'] > 2.0] - store.append('df2',df,data_columns=True,index=False) + store.append('df2', df, data_columns=True, index=False) result = store.select( 'df2', where='values>2.0') tm.assert_frame_equal(expected, result) # https://github.com/PyTables/PyTables/issues/282 # bug in selection when 0th row has a np.nan and an index - #store.append('df3',df,data_columns=True) - #result = store.select( + # store.append('df3',df,data_columns=True) + # result = store.select( # 'df3', where='values>2.0') - #tm.assert_frame_equal(expected, result) + # tm.assert_frame_equal(expected, result) # not in first position float with NaN ok too - df = DataFrame(dict(cols = range(11), values = range(11)),dtype='float64') - df['cols'] = (df['cols']+10).apply(str) + df = DataFrame( + dict(cols=range(11), values=range(11)), dtype='float64') + df['cols'] = (df['cols'] + 10).apply(str) df.iloc[1] = np.nan - expected = df[df['values']>2.0] + expected = df[df['values'] > 2.0] - store.append('df4',df,data_columns=True) + store.append('df4', df, data_columns=True) result = store.select( 'df4', 
where='values>2.0') tm.assert_frame_equal(expected, result) @@ -3082,10 +3189,10 @@ def test_select_dtypes(self): with ensure_clean_store(self.path) as store: df = tm.makeDataFrame() - expected = df[df['A']>0] + expected = df[df['A'] > 0] store.append('df', df, data_columns=True) - np_zero = np.float64(0) + np_zero = np.float64(0) # noqa result = store.select('df', where=["A>np_zero"]) tm.assert_frame_equal(expected, result) @@ -3096,7 +3203,8 @@ def test_select_with_many_inputs(self): df = DataFrame(dict(ts=bdate_range('2012-01-01', periods=300), A=np.random.randn(300), B=range(300), - users = ['a']*50 + ['b']*50 + ['c']*100 + ['a%03d' % i for i in range(100)])) + users=['a'] * 50 + ['b'] * 50 + ['c'] * 100 + + ['a%03d' % i for i in range(100)])) _maybe_remove(store, 'df') store.append('df', df, data_columns=['ts', 'A', 'B', 'users']) @@ -3106,26 +3214,32 @@ def test_select_with_many_inputs(self): tm.assert_frame_equal(expected, result) # small selector - result = store.select('df', [Term("ts>=Timestamp('2012-02-01') & users=['a','b','c']")]) - expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(['a','b','c']) ] + result = store.select( + 'df', [Term("ts>=Timestamp('2012-02-01') & " + "users=['a','b','c']")]) + expected = df[(df.ts >= Timestamp('2012-02-01')) & + df.users.isin(['a', 'b', 'c'])] tm.assert_frame_equal(expected, result) # big selector along the columns - selector = [ 'a','b','c' ] + [ 'a%03d' % i for i in range(60) ] - result = store.select('df', [Term("ts>=Timestamp('2012-02-01')"),Term('users=selector')]) - expected = df[ (df.ts >= Timestamp('2012-02-01')) & df.users.isin(selector) ] + selector = ['a', 'b', 'c'] + ['a%03d' % i for i in range(60)] + result = store.select( + 'df', [Term("ts>=Timestamp('2012-02-01')"), + Term('users=selector')]) + expected = df[(df.ts >= Timestamp('2012-02-01')) & + df.users.isin(selector)] tm.assert_frame_equal(expected, result) - selector = range(100,200) + selector = range(100, 200) result = store.select('df', [Term('B=selector')]) - expected = df[ df.B.isin(selector) ] + expected = df[df.B.isin(selector)] tm.assert_frame_equal(expected, result) self.assertEqual(len(result), 100) # big selector along the index selector = Index(df.ts[0:100].values) - result = store.select('df', [Term('ts=selector')]) - expected = df[ df.ts.isin(selector.values) ] + result = store.select('df', [Term('ts=selector')]) + expected = df[df.ts.isin(selector.values)] tm.assert_frame_equal(expected, result) self.assertEqual(len(result), 100) @@ -3140,80 +3254,84 @@ def test_select_iterator(self): expected = store.select('df') - results = [ s for s in store.select('df',iterator=True) ] + results = [s for s in store.select('df', iterator=True)] result = concat(results) tm.assert_frame_equal(expected, result) - results = [ s for s in store.select('df',chunksize=100) ] + results = [s for s in store.select('df', chunksize=100)] self.assertEqual(len(results), 5) result = concat(results) tm.assert_frame_equal(expected, result) - results = [ s for s in store.select('df',chunksize=150) ] + results = [s for s in store.select('df', chunksize=150)] result = concat(results) tm.assert_frame_equal(result, expected) with ensure_clean_path(self.path) as path: df = tm.makeTimeDataFrame(500) - df.to_hdf(path,'df_non_table') - self.assertRaises(TypeError, read_hdf, path,'df_non_table',chunksize=100) - self.assertRaises(TypeError, read_hdf, path,'df_non_table',iterator=True) + df.to_hdf(path, 'df_non_table') + self.assertRaises(TypeError, read_hdf, path, + 'df_non_table', 
chunksize=100) + self.assertRaises(TypeError, read_hdf, path, + 'df_non_table', iterator=True) with ensure_clean_path(self.path) as path: df = tm.makeTimeDataFrame(500) - df.to_hdf(path,'df',format='table') + df.to_hdf(path, 'df', format='table') - results = [ s for s in read_hdf(path,'df',chunksize=100) ] + results = [s for s in read_hdf(path, 'df', chunksize=100)] result = concat(results) self.assertEqual(len(results), 5) tm.assert_frame_equal(result, df) - tm.assert_frame_equal(result, read_hdf(path,'df')) + tm.assert_frame_equal(result, read_hdf(path, 'df')) # multiple with ensure_clean_store(self.path) as store: df1 = tm.makeTimeDataFrame(500) - store.append('df1',df1,data_columns=True) - df2 = tm.makeTimeDataFrame(500).rename(columns=lambda x: "%s_2" % x) + store.append('df1', df1, data_columns=True) + df2 = tm.makeTimeDataFrame(500).rename( + columns=lambda x: "%s_2" % x) df2['foo'] = 'bar' - store.append('df2',df2) + store.append('df2', df2) df = concat([df1, df2], axis=1) # full selection expected = store.select_as_multiple( ['df1', 'df2'], selector='df1') - results = [ s for s in store.select_as_multiple( - ['df1', 'df2'], selector='df1', chunksize=150) ] + results = [s for s in store.select_as_multiple( + ['df1', 'df2'], selector='df1', chunksize=150)] result = concat(results) tm.assert_frame_equal(expected, result) # where selection - #expected = store.select_as_multiple( + # expected = store.select_as_multiple( # ['df1', 'df2'], where= Term('A>0'), selector='df1') - #results = [] - #for s in store.select_as_multiple( - # ['df1', 'df2'], where= Term('A>0'), selector='df1', chunksize=25): + # results = [] + # for s in store.select_as_multiple( + # ['df1', 'df2'], where= Term('A>0'), selector='df1', + # chunksize=25): # results.append(s) - #result = concat(results) - #tm.assert_frame_equal(expected, result) + # result = concat(results) + # tm.assert_frame_equal(expected, result) def test_select_iterator_complete_8014(self): # GH 8014 # using iterator and where clause - chunksize=1e4 + chunksize = 1e4 # no iterator with ensure_clean_store(self.path) as store: expected = tm.makeTimeDataFrame(100064, 'S') _maybe_remove(store, 'df') - store.append('df',expected) + store.append('df', expected) beg_dt = expected.index[0] end_dt = expected.index[-1] @@ -3225,19 +3343,19 @@ def test_select_iterator_complete_8014(self): # select w/o iterator and where clause, single term, begin # of range, works where = "index >= '%s'" % beg_dt - result = store.select('df',where=where) + result = store.select('df', where=where) tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, single term, end # of range, works where = "index <= '%s'" % end_dt - result = store.select('df',where=where) + result = store.select('df', where=where) tm.assert_frame_equal(expected, result) # select w/o iterator and where clause, inclusive range, # works where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) - result = store.select('df',where=where) + result = store.select('df', where=where) tm.assert_frame_equal(expected, result) # with iterator, full range @@ -3245,31 +3363,34 @@ def test_select_iterator_complete_8014(self): expected = tm.makeTimeDataFrame(100064, 'S') _maybe_remove(store, 'df') - store.append('df',expected) + store.append('df', expected) beg_dt = expected.index[0] end_dt = expected.index[-1] # select w/iterator and no where clause works - results = [ s for s in store.select('df',chunksize=chunksize) ] + results = [s for s in store.select('df', chunksize=chunksize)] result = 
concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, begin of range where = "index >= '%s'" % beg_dt - results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + results = [s for s in store.select( + 'df', where=where, chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, single term, end of range where = "index <= '%s'" % end_dt - results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + results = [s for s in store.select( + 'df', where=where, chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) # select w/iterator and where clause, inclusive range where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) - results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + results = [s for s in store.select( + 'df', where=where, chunksize=chunksize)] result = concat(results) tm.assert_frame_equal(expected, result) @@ -3277,37 +3398,41 @@ def test_select_iterator_non_complete_8014(self): # GH 8014 # using iterator and where clause - chunksize=1e4 + chunksize = 1e4 # with iterator, non complete range with ensure_clean_store(self.path) as store: expected = tm.makeTimeDataFrame(100064, 'S') _maybe_remove(store, 'df') - store.append('df',expected) + store.append('df', expected) beg_dt = expected.index[1] end_dt = expected.index[-2] # select w/iterator and where clause, single term, begin of range where = "index >= '%s'" % beg_dt - results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + results = [s for s in store.select( + 'df', where=where, chunksize=chunksize)] result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = "index <= '%s'" % end_dt - results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + results = [s for s in store.select( + 'df', where=where, chunksize=chunksize)] result = concat(results) rexpected = expected[expected.index <= end_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, inclusive range where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) - results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + results = [s for s in store.select( + 'df', where=where, chunksize=chunksize)] result = concat(results) - rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)] + rexpected = expected[(expected.index >= beg_dt) & + (expected.index <= end_dt)] tm.assert_frame_equal(rexpected, result) # with iterator, empty where @@ -3315,13 +3440,14 @@ def test_select_iterator_non_complete_8014(self): expected = tm.makeTimeDataFrame(100064, 'S') _maybe_remove(store, 'df') - store.append('df',expected) + store.append('df', expected) end_dt = expected.index[-1] # select w/iterator and where clause, single term, begin of range where = "index > '%s'" % end_dt - results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + results = [s for s in store.select( + 'df', where=where, chunksize=chunksize)] self.assertEqual(0, len(results)) def test_select_iterator_many_empty_frames(self): @@ -3329,28 +3455,30 @@ def test_select_iterator_many_empty_frames(self): # GH 8014 # using iterator and where clause can return many empty # frames. 
- chunksize=int(1e4) + chunksize = int(1e4) # with iterator, range limited to the first chunk with ensure_clean_store(self.path) as store: expected = tm.makeTimeDataFrame(100000, 'S') _maybe_remove(store, 'df') - store.append('df',expected) + store.append('df', expected) beg_dt = expected.index[0] - end_dt = expected.index[chunksize-1] + end_dt = expected.index[chunksize - 1] # select w/iterator and where clause, single term, begin of range where = "index >= '%s'" % beg_dt - results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + results = [s for s in store.select( + 'df', where=where, chunksize=chunksize)] result = concat(results) rexpected = expected[expected.index >= beg_dt] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause, single term, end of range where = "index <= '%s'" % end_dt - results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + results = [s for s in store.select( + 'df', where=where, chunksize=chunksize)] tm.assert_equal(1, len(results)) result = concat(results) @@ -3359,12 +3487,14 @@ def test_select_iterator_many_empty_frames(self): # select w/iterator and where clause, inclusive range where = "index >= '%s' & index <= '%s'" % (beg_dt, end_dt) - results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + results = [s for s in store.select( + 'df', where=where, chunksize=chunksize)] # should be 1, is 10 tm.assert_equal(1, len(results)) result = concat(results) - rexpected = expected[(expected.index >= beg_dt) & (expected.index <= end_dt)] + rexpected = expected[(expected.index >= beg_dt) & + (expected.index <= end_dt)] tm.assert_frame_equal(rexpected, result) # select w/iterator and where clause which selects @@ -3375,74 +3505,88 @@ def test_select_iterator_many_empty_frames(self): # True. 
where = "index <= '%s' & index >= '%s'" % (beg_dt, end_dt) - results = [ s for s in store.select('df',where=where,chunksize=chunksize) ] + results = [s for s in store.select( + 'df', where=where, chunksize=chunksize)] # should be [] tm.assert_equal(0, len(results)) - def test_retain_index_attributes(self): # GH 3499, losing frequency info on index recreation - df = DataFrame(dict(A = Series(lrange(3), - index=date_range('2000-1-1',periods=3,freq='H')))) + df = DataFrame(dict( + A=Series(lrange(3), + index=date_range('2000-1-1', periods=3, freq='H')))) with ensure_clean_store(self.path) as store: - _maybe_remove(store,'data') + _maybe_remove(store, 'data') store.put('data', df, format='table') result = store.get('data') - tm.assert_frame_equal(df,result) - - for attr in ['freq','tz','name']: - for idx in ['index','columns']: - self.assertEqual(getattr(getattr(df,idx),attr,None), - getattr(getattr(result,idx),attr,None)) + tm.assert_frame_equal(df, result) + for attr in ['freq', 'tz', 'name']: + for idx in ['index', 'columns']: + self.assertEqual(getattr(getattr(df, idx), attr, None), + getattr(getattr(result, idx), attr, None)) # try to append a table with a different frequency - with tm.assert_produces_warning(expected_warning=AttributeConflictWarning): - df2 = DataFrame(dict(A = Series(lrange(3), - index=date_range('2002-1-1',periods=3,freq='D')))) - store.append('data',df2) + with tm.assert_produces_warning( + expected_warning=AttributeConflictWarning): + df2 = DataFrame(dict( + A=Series(lrange(3), + index=date_range('2002-1-1', + periods=3, freq='D')))) + store.append('data', df2) self.assertIsNone(store.get_storer('data').info['index']['freq']) # this is ok - _maybe_remove(store,'df2') - df2 = DataFrame(dict(A = Series(lrange(3), - index=[Timestamp('20010101'),Timestamp('20010102'),Timestamp('20020101')]))) - store.append('df2',df2) - df3 = DataFrame(dict(A = Series(lrange(3),index=date_range('2002-1-1',periods=3,freq='D')))) - store.append('df2',df3) + _maybe_remove(store, 'df2') + df2 = DataFrame(dict( + A=Series(lrange(3), + index=[Timestamp('20010101'), Timestamp('20010102'), + Timestamp('20020101')]))) + store.append('df2', df2) + df3 = DataFrame(dict( + A=Series(lrange(3), + index=date_range('2002-1-1', periods=3, + freq='D')))) + store.append('df2', df3) def test_retain_index_attributes2(self): - with ensure_clean_path(self.path) as path: - expected_warning = Warning if PY35 else AttributeConflictWarning - with tm.assert_produces_warning(expected_warning=expected_warning, check_stacklevel=False): - - df = DataFrame(dict(A = Series(lrange(3), index=date_range('2000-1-1',periods=3,freq='H')))) - df.to_hdf(path,'data',mode='w',append=True) - df2 = DataFrame(dict(A = Series(lrange(3), index=date_range('2002-1-1',periods=3,freq='D')))) - df2.to_hdf(path,'data',append=True) - - idx = date_range('2000-1-1',periods=3,freq='H') + with tm.assert_produces_warning(expected_warning=expected_warning, + check_stacklevel=False): + + df = DataFrame(dict( + A=Series(lrange(3), + index=date_range('2000-1-1', + periods=3, freq='H')))) + df.to_hdf(path, 'data', mode='w', append=True) + df2 = DataFrame(dict( + A=Series(lrange(3), + index=date_range('2002-1-1', periods=3, + freq='D')))) + df2.to_hdf(path, 'data', append=True) + + idx = date_range('2000-1-1', periods=3, freq='H') idx.name = 'foo' - df = DataFrame(dict(A = Series(lrange(3), index=idx))) - df.to_hdf(path,'data',mode='w',append=True) + df = DataFrame(dict(A=Series(lrange(3), index=idx))) + df.to_hdf(path, 'data', mode='w', append=True) - 
self.assertEqual(read_hdf(path,'data').index.name, 'foo') + self.assertEqual(read_hdf(path, 'data').index.name, 'foo') - with tm.assert_produces_warning(expected_warning=expected_warning, check_stacklevel=False): + with tm.assert_produces_warning(expected_warning=expected_warning, + check_stacklevel=False): - idx2 = date_range('2001-1-1',periods=3,freq='H') + idx2 = date_range('2001-1-1', periods=3, freq='H') idx2.name = 'bar' - df2 = DataFrame(dict(A = Series(lrange(3), index=idx2))) - df2.to_hdf(path,'data',append=True) + df2 = DataFrame(dict(A=Series(lrange(3), index=idx2))) + df2.to_hdf(path, 'data', append=True) - self.assertIsNone(read_hdf(path,'data').index.name) + self.assertIsNone(read_hdf(path, 'data').index.name) def test_panel_select(self): @@ -3469,7 +3613,7 @@ def test_frame_select(self): df = tm.makeTimeDataFrame() with ensure_clean_store(self.path) as store: - store.put('frame', df,format='table') + store.put('frame', df, format='table') date = df.index[len(df) // 2] crit1 = Term('index>=date') @@ -3502,107 +3646,117 @@ def test_frame_select_complex(self): df = tm.makeTimeDataFrame() df['string'] = 'foo' - df.loc[df.index[0:4],'string'] = 'bar' + df.loc[df.index[0:4], 'string'] = 'bar' with ensure_clean_store(self.path) as store: store.put('df', df, format='table', data_columns=['string']) # empty result = store.select('df', 'index>df.index[3] & string="bar"') - expected = df.loc[(df.index>df.index[3]) & (df.string=='bar')] + expected = df.loc[(df.index > df.index[3]) & (df.string == 'bar')] tm.assert_frame_equal(result, expected) result = store.select('df', 'index>df.index[3] & string="foo"') - expected = df.loc[(df.index>df.index[3]) & (df.string=='foo')] + expected = df.loc[(df.index > df.index[3]) & (df.string == 'foo')] tm.assert_frame_equal(result, expected) # or result = store.select('df', 'index>df.index[3] | string="bar"') - expected = df.loc[(df.index>df.index[3]) | (df.string=='bar')] + expected = df.loc[(df.index > df.index[3]) | (df.string == 'bar')] tm.assert_frame_equal(result, expected) - result = store.select('df', '(index>df.index[3] & index<=df.index[6]) | string="bar"') - expected = df.loc[((df.index>df.index[3]) & (df.index<=df.index[6])) | (df.string=='bar')] + result = store.select('df', '(index>df.index[3] & ' + 'index<=df.index[6]) | string="bar"') + expected = df.loc[((df.index > df.index[3]) & ( + df.index <= df.index[6])) | (df.string == 'bar')] tm.assert_frame_equal(result, expected) # invert result = store.select('df', 'string!="bar"') - expected = df.loc[df.string!='bar'] + expected = df.loc[df.string != 'bar'] tm.assert_frame_equal(result, expected) # invert not implemented in numexpr :( - self.assertRaises(NotImplementedError, store.select, 'df', '~(string="bar")') + self.assertRaises(NotImplementedError, + store.select, 'df', '~(string="bar")') # invert ok for filters result = store.select('df', "~(columns=['A','B'])") - expected = df.loc[:,df.columns.difference(['A','B'])] + expected = df.loc[:, df.columns.difference(['A', 'B'])] tm.assert_frame_equal(result, expected) # in - result = store.select('df', "index>df.index[3] & columns in ['A','B']") - expected = df.loc[df.index>df.index[3]].reindex(columns=['A','B']) + result = store.select( + 'df', "index>df.index[3] & columns in ['A','B']") + expected = df.loc[df.index > df.index[3]].reindex(columns=[ + 'A', 'B']) tm.assert_frame_equal(result, expected) def test_frame_select_complex2(self): - with ensure_clean_path(['parms.hdf','hist.hdf']) as paths: + with ensure_clean_path(['parms.hdf', 
'hist.hdf']) as paths: pp, hh = paths # use non-trivial selection criteria - parms = DataFrame({ 'A' : [1,1,2,2,3] }) - parms.to_hdf(pp,'df',mode='w',format='table',data_columns=['A']) + parms = DataFrame({'A': [1, 1, 2, 2, 3]}) + parms.to_hdf(pp, 'df', mode='w', + format='table', data_columns=['A']) - selection = read_hdf(pp,'df',where='A=[2,3]') - hist = DataFrame(np.random.randn(25,1),columns=['data'], - index=MultiIndex.from_tuples([ (i,j) for i in range(5) for j in range(5) ], - names=['l1','l2'])) + selection = read_hdf(pp, 'df', where='A=[2,3]') + hist = DataFrame(np.random.randn(25, 1), + columns=['data'], + index=MultiIndex.from_tuples( + [(i, j) for i in range(5) + for j in range(5)], + names=['l1', 'l2'])) - hist.to_hdf(hh,'df',mode='w',format='table') + hist.to_hdf(hh, 'df', mode='w', format='table') - expected = read_hdf(hh,'df',where=Term('l1','=',[2,3,4])) + expected = read_hdf(hh, 'df', where=Term('l1', '=', [2, 3, 4])) # list like - result = read_hdf(hh,'df',where=Term('l1','=',selection.index.tolist())) + result = read_hdf(hh, 'df', where=Term( + 'l1', '=', selection.index.tolist())) assert_frame_equal(result, expected) - l = selection.index.tolist() + l = selection.index.tolist() # noqa # sccope with list like store = HDFStore(hh) - result = store.select('df',where='l1=l') + result = store.select('df', where='l1=l') assert_frame_equal(result, expected) store.close() - result = read_hdf(hh,'df',where='l1=l') + result = read_hdf(hh, 'df', where='l1=l') assert_frame_equal(result, expected) # index - index = selection.index - result = read_hdf(hh,'df',where='l1=index') + index = selection.index # noqa + result = read_hdf(hh, 'df', where='l1=index') assert_frame_equal(result, expected) - result = read_hdf(hh,'df',where='l1=selection.index') + result = read_hdf(hh, 'df', where='l1=selection.index') assert_frame_equal(result, expected) - result = read_hdf(hh,'df',where='l1=selection.index.tolist()') + result = read_hdf(hh, 'df', where='l1=selection.index.tolist()') assert_frame_equal(result, expected) - result = read_hdf(hh,'df',where='l1=list(selection.index)') + result = read_hdf(hh, 'df', where='l1=list(selection.index)') assert_frame_equal(result, expected) # sccope with index store = HDFStore(hh) - result = store.select('df',where='l1=index') + result = store.select('df', where='l1=index') assert_frame_equal(result, expected) - result = store.select('df',where='l1=selection.index') + result = store.select('df', where='l1=selection.index') assert_frame_equal(result, expected) - result = store.select('df',where='l1=selection.index.tolist()') + result = store.select('df', where='l1=selection.index.tolist()') assert_frame_equal(result, expected) - result = store.select('df',where='l1=list(selection.index)') + result = store.select('df', where='l1=list(selection.index)') assert_frame_equal(result, expected) store.close() @@ -3617,10 +3771,12 @@ def test_invalid_filtering(self): store.put('df', df, format='table') # not implemented - self.assertRaises(NotImplementedError, store.select, 'df', "columns=['A'] | columns=['B']") + self.assertRaises(NotImplementedError, store.select, + 'df', "columns=['A'] | columns=['B']") # in theory we could deal with this - self.assertRaises(NotImplementedError, store.select, 'df', "columns=['A','B'] & columns=['C']") + self.assertRaises(NotImplementedError, store.select, + 'df', "columns=['A','B'] & columns=['C']") def test_string_select(self): # GH 2973 @@ -3630,44 +3786,44 @@ def test_string_select(self): # test string ==/!= df['x'] = 'none' - 
df.ix[2:7,'x'] = '' + df.ix[2:7, 'x'] = '' - store.append('df',df,data_columns=['x']) + store.append('df', df, data_columns=['x']) - result = store.select('df',Term('x=none')) + result = store.select('df', Term('x=none')) expected = df[df.x == 'none'] - assert_frame_equal(result,expected) + assert_frame_equal(result, expected) try: - result = store.select('df',Term('x!=none')) + result = store.select('df', Term('x!=none')) expected = df[df.x != 'none'] - assert_frame_equal(result,expected) + assert_frame_equal(result, expected) except Exception as detail: com.pprint_thing("[{0}]".format(detail)) com.pprint_thing(store) com.pprint_thing(expected) df2 = df.copy() - df2.loc[df2.x=='','x'] = np.nan + df2.loc[df2.x == '', 'x'] = np.nan - store.append('df2',df2,data_columns=['x']) - result = store.select('df2',Term('x!=none')) + store.append('df2', df2, data_columns=['x']) + result = store.select('df2', Term('x!=none')) expected = df2[isnull(df2.x)] - assert_frame_equal(result,expected) + assert_frame_equal(result, expected) # int ==/!= df['int'] = 1 - df.ix[2:7,'int'] = 2 + df.ix[2:7, 'int'] = 2 - store.append('df3',df,data_columns=['int']) + store.append('df3', df, data_columns=['int']) - result = store.select('df3',Term('int=2')) - expected = df[df.int==2] - assert_frame_equal(result,expected) + result = store.select('df3', Term('int=2')) + expected = df[df.int == 2] + assert_frame_equal(result, expected) - result = store.select('df3',Term('int!=2')) - expected = df[df.int!=2] - assert_frame_equal(result,expected) + result = store.select('df3', Term('int!=2')) + expected = df[df.int != 2] + assert_frame_equal(result, expected) def test_read_column(self): @@ -3681,7 +3837,7 @@ def test_read_column(self): self.assertRaises(KeyError, store.select_column, 'df', 'foo') def f(): - store.select_column('df', 'index', where = ['index>5']) + store.select_column('df', 'index', where=['index>5']) self.assertRaises(Exception, f) # valid @@ -3734,7 +3890,6 @@ def f(): result = store.select_column('df4', 'B') tm.assert_series_equal(result, expected) - def test_coordinates(self): df = tm.makeTimeDataFrame() @@ -3745,7 +3900,7 @@ def test_coordinates(self): # all c = store.select_as_coordinates('df') - assert((c.values == np.arange(len(df.index))).all() == True) + assert((c.values == np.arange(len(df.index))).all()) # get coordinates back & test vs frame _maybe_remove(store, 'df') @@ -3753,13 +3908,13 @@ def test_coordinates(self): df = DataFrame(dict(A=lrange(5), B=lrange(5))) store.append('df', df) c = store.select_as_coordinates('df', ['index<3']) - assert((c.values == np.arange(3)).all() == True) + assert((c.values == np.arange(3)).all()) result = store.select('df', where=c) expected = df.ix[0:2, :] tm.assert_frame_equal(result, expected) c = store.select_as_coordinates('df', ['index>=3', 'index<=4']) - assert((c.values == np.arange(2) + 3).all() == True) + assert((c.values == np.arange(2) + 3).all()) result = store.select('df', where=c) expected = df.ix[3:4, :] tm.assert_frame_equal(result, expected) @@ -3785,50 +3940,55 @@ def test_coordinates(self): # pass array/mask as the coordinates with ensure_clean_store(self.path) as store: - df = DataFrame(np.random.randn(1000,2),index=date_range('20000101',periods=1000)) - store.append('df',df) - c = store.select_column('df','index') - where = c[DatetimeIndex(c).month==5].index + df = DataFrame(np.random.randn(1000, 2), + index=date_range('20000101', periods=1000)) + store.append('df', df) + c = store.select_column('df', 'index') + where = 
c[DatetimeIndex(c).month == 5].index expected = df.iloc[where] # locations - result = store.select('df',where=where) - tm.assert_frame_equal(result,expected) + result = store.select('df', where=where) + tm.assert_frame_equal(result, expected) # boolean - result = store.select('df',where=where) - tm.assert_frame_equal(result,expected) + result = store.select('df', where=where) + tm.assert_frame_equal(result, expected) # invalid - self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df),dtype='float64')) - self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)+1)) - self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5) - self.assertRaises(ValueError, store.select, 'df',where=np.arange(len(df)),start=5,stop=10) + self.assertRaises(ValueError, store.select, 'df', + where=np.arange(len(df), dtype='float64')) + self.assertRaises(ValueError, store.select, 'df', + where=np.arange(len(df) + 1)) + self.assertRaises(ValueError, store.select, 'df', + where=np.arange(len(df)), start=5) + self.assertRaises(ValueError, store.select, 'df', + where=np.arange(len(df)), start=5, stop=10) # selection with filter - selection = date_range('20000101',periods=500) + selection = date_range('20000101', periods=500) result = store.select('df', where='index in selection') expected = df[df.index.isin(selection)] - tm.assert_frame_equal(result,expected) + tm.assert_frame_equal(result, expected) # list - df = DataFrame(np.random.randn(10,2)) - store.append('df2',df) - result = store.select('df2',where=[0,3,5]) - expected = df.iloc[[0,3,5]] - tm.assert_frame_equal(result,expected) + df = DataFrame(np.random.randn(10, 2)) + store.append('df2', df) + result = store.select('df2', where=[0, 3, 5]) + expected = df.iloc[[0, 3, 5]] + tm.assert_frame_equal(result, expected) # boolean where = [True] * 10 where[-2] = False - result = store.select('df2',where=where) + result = store.select('df2', where=where) expected = df.loc[where] - tm.assert_frame_equal(result,expected) + tm.assert_frame_equal(result, expected) # start/stop result = store.select('df2', start=5, stop=10) expected = df[5:10] - tm.assert_frame_equal(result,expected) + tm.assert_frame_equal(result, expected) def test_append_to_multiple(self): df1 = tm.makeTimeDataFrame() @@ -3840,7 +4000,8 @@ def test_append_to_multiple(self): # exceptions self.assertRaises(ValueError, store.append_to_multiple, - {'df1': ['A', 'B'], 'df2': None}, df, selector='df3') + {'df1': ['A', 'B'], 'df2': None}, df, + selector='df3') self.assertRaises(ValueError, store.append_to_multiple, {'df1': None, 'df2': None}, df, selector='df3') self.assertRaises( @@ -3901,11 +4062,13 @@ def test_select_as_multiple(self): self.assertRaises(Exception, store.select_as_multiple, [None], where=['A>0', 'B>0'], selector='df1') self.assertRaises(KeyError, store.select_as_multiple, - ['df1','df3'], where=['A>0', 'B>0'], selector='df1') + ['df1', 'df3'], where=['A>0', 'B>0'], + selector='df1') self.assertRaises(KeyError, store.select_as_multiple, ['df3'], where=['A>0', 'B>0'], selector='df1') self.assertRaises(KeyError, store.select_as_multiple, - ['df1','df2'], where=['A>0', 'B>0'], selector='df4') + ['df1', 'df2'], where=['A>0', 'B>0'], + selector='df4') # default select result = store.select('df1', ['A>0', 'B>0']) @@ -3933,26 +4096,30 @@ def test_select_as_multiple(self): # test excpection for diff rows store.append('df3', tm.makeTimeDataFrame(nper=50)) self.assertRaises(ValueError, store.select_as_multiple, - ['df1','df3'], where=['A>0', 'B>0'], 
selector='df1') + ['df1', 'df3'], where=['A>0', 'B>0'], + selector='df1') def test_nan_selection_bug_4858(self): # GH 4858; nan selection bug, only works for pytables >= 3.1 if LooseVersion(tables.__version__) < '3.1.0': - raise nose.SkipTest('tables version does not support fix for nan selection bug: GH 4858') + raise nose.SkipTest('tables version does not support fix for nan ' + 'selection bug: GH 4858') with ensure_clean_store(self.path) as store: - df = DataFrame(dict(cols = range(6), values = range(6)), dtype='float64') - df['cols'] = (df['cols']+10).apply(str) + df = DataFrame(dict(cols=range(6), values=range(6)), + dtype='float64') + df['cols'] = (df['cols'] + 10).apply(str) df.iloc[0] = np.nan - expected = DataFrame(dict(cols = ['13.0','14.0','15.0'], values = [3.,4.,5.]), index=[3,4,5]) + expected = DataFrame(dict(cols=['13.0', '14.0', '15.0'], values=[ + 3., 4., 5.]), index=[3, 4, 5]) # write w/o the index on that particular column - store.append('df',df, data_columns=True,index=['cols']) - result = store.select('df',where='values>2.0') - assert_frame_equal(result,expected) + store.append('df', df, data_columns=True, index=['cols']) + result = store.select('df', where='values>2.0') + assert_frame_equal(result, expected) def test_start_stop(self): @@ -4031,7 +4198,7 @@ def test_multiple_open_close(self): with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.to_hdf(path,'df',mode='w',format='table') + df.to_hdf(path, 'df', mode='w', format='table') # single store = HDFStore(path) @@ -4047,6 +4214,7 @@ def test_multiple_open_close(self): # multiples store1 = HDFStore(path) + def f(): HDFStore(path) self.assertRaises(ValueError, f) @@ -4076,11 +4244,11 @@ def f(): self.assertFalse(store2.is_open) # nested close - store = HDFStore(path,mode='w') - store.append('df',df) + store = HDFStore(path, mode='w') + store.append('df', df) store2 = HDFStore(path) - store2.append('df2',df) + store2.append('df2', df) store2.close() self.assertIn('CLOSED', str(store2)) self.assertFalse(store2.is_open) @@ -4090,7 +4258,7 @@ def f(): self.assertFalse(store.is_open) # double closing - store = HDFStore(path,mode='w') + store = HDFStore(path, mode='w') store.append('df', df) store2 = HDFStore(path) @@ -4106,16 +4274,16 @@ def f(): with ensure_clean_path(self.path) as path: df = tm.makeDataFrame() - df.to_hdf(path,'df',mode='w',format='table') + df.to_hdf(path, 'df', mode='w', format='table') store = HDFStore(path) store.close() self.assertRaises(ClosedFileError, store.keys) - self.assertRaises(ClosedFileError, lambda : 'df' in store) - self.assertRaises(ClosedFileError, lambda : len(store)) - self.assertRaises(ClosedFileError, lambda : store['df']) - self.assertRaises(ClosedFileError, lambda : store.df) + self.assertRaises(ClosedFileError, lambda: 'df' in store) + self.assertRaises(ClosedFileError, lambda: len(store)) + self.assertRaises(ClosedFileError, lambda: store['df']) + self.assertRaises(ClosedFileError, lambda: store.df) self.assertRaises(ClosedFileError, store.select, 'df') self.assertRaises(ClosedFileError, store.get, 'df') self.assertRaises(ClosedFileError, store.append, 'df2', df) @@ -4129,7 +4297,9 @@ def f(): def test_pytables_native_read(self): - with ensure_clean_store(tm.get_data_path('legacy_hdf/pytables_native.h5'), mode='r') as store: + with ensure_clean_store( + tm.get_data_path('legacy_hdf/pytables_native.h5'), + mode='r') as store: d2 = store['detector/readout'] self.assertIsInstance(d2, DataFrame) @@ -4138,13 +4308,17 @@ def test_pytables_native2_read(self): if 
PY35 and is_platform_windows(): raise nose.SkipTest("native2 read fails oddly on windows / 3.5") - with ensure_clean_store(tm.get_data_path('legacy_hdf/pytables_native2.h5'), mode='r') as store: + with ensure_clean_store( + tm.get_data_path('legacy_hdf/pytables_native2.h5'), + mode='r') as store: str(store) d1 = store['detector'] self.assertIsInstance(d1, DataFrame) def test_legacy_read(self): - with ensure_clean_store(tm.get_data_path('legacy_hdf/legacy.h5'), mode='r') as store: + with ensure_clean_store( + tm.get_data_path('legacy_hdf/legacy.h5'), + mode='r') as store: store['a'] store['b'] store['c'] @@ -4152,7 +4326,9 @@ def test_legacy_read(self): def test_legacy_table_read(self): # legacy table types - with ensure_clean_store(tm.get_data_path('legacy_hdf/legacy_table.h5'), mode='r') as store: + with ensure_clean_store( + tm.get_data_path('legacy_hdf/legacy_table.h5'), + mode='r') as store: store.select('df1') store.select('df2') store.select('wp1') @@ -4161,7 +4337,8 @@ def test_legacy_table_read(self): store.select('df2', typ='legacy_frame') # old version warning - with tm.assert_produces_warning(expected_warning=IncompatibilityWarning): + with tm.assert_produces_warning( + expected_warning=IncompatibilityWarning): self.assertRaises( Exception, store.select, 'wp1', Term('minor_axis=B')) @@ -4172,7 +4349,9 @@ def test_legacy_table_read(self): def test_legacy_0_10_read(self): # legacy from 0.10 - with ensure_clean_store(tm.get_data_path('legacy_hdf/legacy_0.10.h5'), mode='r') as store: + with ensure_clean_store( + tm.get_data_path('legacy_hdf/legacy_0.10.h5'), + mode='r') as store: str(store) for k in store.keys(): store.select(k) @@ -4194,20 +4373,20 @@ def test_legacy_0_11_read(self): def test_copy(self): - def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): + def do_copy(f=None, new_f=None, keys=None, propindexes=True, **kwargs): try: if f is None: f = tm.get_data_path(os.path.join('legacy_hdf', 'legacy_0.10.h5')) - store = HDFStore(f, 'r') if new_f is None: import tempfile fd, new_f = tempfile.mkstemp() - tstore = store.copy(new_f, keys = keys, propindexes = propindexes, **kwargs) + tstore = store.copy( + new_f, keys=keys, propindexes=propindexes, **kwargs) # check keys if keys is None: @@ -4238,8 +4417,8 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): safe_remove(new_f) do_copy() - do_copy(keys = ['/a','/b','/df1_mixed']) - do_copy(propindexes = False) + do_copy(keys=['/a', '/b', '/df1_mixed']) + do_copy(propindexes=False) # new table df = tm.makeDataFrame() @@ -4247,17 +4426,18 @@ def do_copy(f = None, new_f = None, keys = None, propindexes = True, **kwargs): try: path = create_tempfile(self.path) st = HDFStore(path) - st.append('df', df, data_columns = ['A']) + st.append('df', df, data_columns=['A']) st.close() - do_copy(f = path) - do_copy(f = path, propindexes = False) + do_copy(f=path) + do_copy(f=path, propindexes=False) finally: safe_remove(path) def test_legacy_table_write(self): raise nose.SkipTest("cannot write legacy tables") - store = HDFStore(tm.get_data_path('legacy_hdf/legacy_table_%s.h5' % pandas.__version__), 'a') + store = HDFStore(tm.get_data_path( + 'legacy_hdf/legacy_table_%s.h5' % pandas.__version__), 'a') df = tm.makeDataFrame() wp = tm.makePanel() @@ -4271,8 +4451,8 @@ def test_legacy_table_write(self): columns=['A', 'B', 'C']) store.append('mi', df) - df = DataFrame(dict(A = 'foo', B = 'bar'),index=lrange(10)) - store.append('df', df, data_columns = ['B'], min_itemsize={'A' : 200 }) + df = 
DataFrame(dict(A='foo', B='bar'), index=lrange(10)) + store.append('df', df, data_columns=['B'], min_itemsize={'A': 200}) store.append('wp', wp) store.close() @@ -4330,13 +4510,13 @@ def test_tseries_indices_frame(self): def test_unicode_index(self): unicode_values = [u('\u03c3'), u('\u03c3\u03c3')] + def f(): s = Series(np.random.randn(len(unicode_values)), unicode_values) self._check_roundtrip(s, tm.assert_series_equal) compat_assert_produces_warning(PerformanceWarning, f) - def test_unicode_longer_encoded(self): # GH 11234 char = '\u0394' @@ -4384,7 +4564,8 @@ def test_append_with_diff_col_name_types_raises_value_error(self): store.append(name, d) def test_query_with_nested_special_character(self): - df = DataFrame({'a': ['a', 'a', 'c', 'b', 'test & test', 'c' , 'b', 'e'], + df = DataFrame({'a': ['a', 'a', 'c', 'b', + 'test & test', 'c', 'b', 'e'], 'b': [1, 2, 3, 4, 5, 6, 7, 8]}) expected = df[df.a == 'test & test'] with ensure_clean_store(self.path) as store: @@ -4398,38 +4579,40 @@ def test_categorical(self): # basic _maybe_remove(store, 's') - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'], ordered=False)) + s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[ + 'a', 'b', 'c', 'd'], ordered=False)) store.append('s', s, format='table') result = store.select('s') tm.assert_series_equal(s, result) _maybe_remove(store, 's_ordered') - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'], ordered=True)) + s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[ + 'a', 'b', 'c', 'd'], ordered=True)) store.append('s_ordered', s, format='table') result = store.select('s_ordered') tm.assert_series_equal(s, result) _maybe_remove(store, 'df') - df = DataFrame({"s":s, "vals":[1,2,3,4,5,6]}) + df = DataFrame({"s": s, "vals": [1, 2, 3, 4, 5, 6]}) store.append('df', df, format='table') result = store.select('df') tm.assert_frame_equal(result, df) # dtypes - s = Series([1,1,2,2,3,4,5]).astype('category') - store.append('si',s) + s = Series([1, 1, 2, 2, 3, 4, 5]).astype('category') + store.append('si', s) result = store.select('si') tm.assert_series_equal(result, s) - s = Series([1,1,np.nan,2,3,4,5]).astype('category') - store.append('si2',s) + s = Series([1, 1, np.nan, 2, 3, 4, 5]).astype('category') + store.append('si2', s) result = store.select('si2') tm.assert_series_equal(result, s) # multiple df2 = df.copy() df2['s2'] = Series(list('abcdefg')).astype('category') - store.append('df2',df2) + store.append('df2', df2) result = store.select('df2') tm.assert_frame_equal(result, df2) @@ -4439,55 +4622,59 @@ def test_categorical(self): self.assertTrue('/df2/meta/values_block_1/meta' in str(store)) # unordered - s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=['a','b','c','d'],ordered=False)) + s = Series(Categorical(['a', 'b', 'b', 'a', 'a', 'c'], categories=[ + 'a', 'b', 'c', 'd'], ordered=False)) store.append('s2', s, format='table') result = store.select('s2') tm.assert_series_equal(result, s) # query store.append('df3', df, data_columns=['s']) - expected = df[df.s.isin(['b','c'])] - result = store.select('df3', where = ['s in ["b","c"]']) + expected = df[df.s.isin(['b', 'c'])] + result = store.select('df3', where=['s in ["b","c"]']) tm.assert_frame_equal(result, expected) - expected = df[df.s.isin(['b','c'])] - result = store.select('df3', where = ['s = ["b","c"]']) + expected = df[df.s.isin(['b', 'c'])] + result = store.select('df3', where=['s = ["b","c"]']) tm.assert_frame_equal(result, 
expected) expected = df[df.s.isin(['d'])] - result = store.select('df3', where = ['s in ["d"]']) + result = store.select('df3', where=['s in ["d"]']) tm.assert_frame_equal(result, expected) expected = df[df.s.isin(['f'])] - result = store.select('df3', where = ['s in ["f"]']) + result = store.select('df3', where=['s in ["f"]']) tm.assert_frame_equal(result, expected) # appending with same categories is ok store.append('df3', df) - df = concat([df,df]) - expected = df[df.s.isin(['b','c'])] - result = store.select('df3', where = ['s in ["b","c"]']) + df = concat([df, df]) + expected = df[df.s.isin(['b', 'c'])] + result = store.select('df3', where=['s in ["b","c"]']) tm.assert_frame_equal(result, expected) # appending must have the same categories df3 = df.copy() df3['s'].cat.remove_unused_categories(inplace=True) - self.assertRaises(ValueError, lambda : store.append('df3', df3)) + self.assertRaises(ValueError, lambda: store.append('df3', df3)) # remove - # make sure meta data is removed (its a recursive removal so should be) + # make sure meta data is removed (its a recursive removal so should + # be) result = store.select('df3/meta/s/meta') self.assertIsNotNone(result) store.remove('df3') - self.assertRaises(KeyError, lambda : store.select('df3/meta/s/meta')) + self.assertRaises( + KeyError, lambda: store.select('df3/meta/s/meta')) def test_duplicate_column_name(self): df = DataFrame(columns=["a", "a"], data=[[0, 0]]) with ensure_clean_path(self.path) as path: - self.assertRaises(ValueError, df.to_hdf, path, 'df', format='fixed') + self.assertRaises(ValueError, df.to_hdf, + path, 'df', format='fixed') df.to_hdf(path, 'df', format='table') other = read_hdf(path, 'df') @@ -4498,7 +4685,7 @@ def test_duplicate_column_name(self): def test_round_trip_equals(self): # GH 9330 - df = DataFrame({"B": [1,2], "A": ["x","y"]}) + df = DataFrame({"B": [1, 2], "A": ["x", "y"]}) with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', format='table') @@ -4511,8 +4698,9 @@ def test_preserve_timedeltaindex_type(self): # GH9635 # Storing TimedeltaIndexed DataFrames in fixed stores did not preserve # the type of the index. 
- df = DataFrame(np.random.normal(size=(10,5))) - df.index = timedelta_range(start='0s',periods=10,freq='1s',name='example') + df = DataFrame(np.random.normal(size=(10, 5))) + df.index = timedelta_range( + start='0s', periods=10, freq='1s', name='example') with ensure_clean_store(self.path) as store: @@ -4530,7 +4718,7 @@ def test_colums_multiindex_modified(self): df.index.name = 'letters' df = df.set_index(keys='E', append=True) - data_columns = df.index.names+df.columns.tolist() + data_columns = df.index.names + df.columns.tolist() with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', mode='a', @@ -4539,7 +4727,7 @@ def test_colums_multiindex_modified(self): index=False) cols2load = list('BCD') cols2load_original = list(cols2load) - df_loaded = read_hdf(path, 'df', columns=cols2load) + df_loaded = read_hdf(path, 'df', columns=cols2load) # noqa self.assertTrue(cols2load_original == cols2load) def test_to_hdf_with_object_column_names(self): @@ -4547,10 +4735,10 @@ def test_to_hdf_with_object_column_names(self): # Writing HDF5 table format should only work for string-like # column types - types_should_fail = [ tm.makeIntIndex, tm.makeFloatIndex, - tm.makeDateIndex, tm.makeTimedeltaIndex, - tm.makePeriodIndex ] - types_should_run = [ tm.makeStringIndex, tm.makeCategoricalIndex ] + types_should_fail = [tm.makeIntIndex, tm.makeFloatIndex, + tm.makeDateIndex, tm.makeTimedeltaIndex, + tm.makePeriodIndex] + types_should_run = [tm.makeStringIndex, tm.makeCategoricalIndex] if compat.PY3: types_should_run.append(tm.makeUnicodeIndex) @@ -4560,18 +4748,19 @@ def test_to_hdf_with_object_column_names(self): for index in types_should_fail: df = DataFrame(np.random.randn(10, 2), columns=index(2)) with ensure_clean_path(self.path) as path: - with self.assertRaises(ValueError, - msg="cannot have non-object label DataIndexableCol"): + with self.assertRaises( + ValueError, msg=("cannot have non-object label " + "DataIndexableCol")): df.to_hdf(path, 'df', format='table', data_columns=True) for index in types_should_run: df = DataFrame(np.random.randn(10, 2), columns=index(2)) with ensure_clean_path(self.path) as path: df.to_hdf(path, 'df', format='table', data_columns=True) - result = pd.read_hdf(path, 'df', where="index = [{0}]".format(df.index[0])) + result = pd.read_hdf( + path, 'df', where="index = [{0}]".format(df.index[0])) assert(len(result)) - def test_read_hdf_open_store(self): # GH10330 # No check for non-string path_or-buf, and no test of open store @@ -4625,8 +4814,10 @@ def test_invalid_complib(self): index=list('abcd'), columns=list('ABCDE')) with ensure_clean_path(self.path) as path: - self.assertRaises(ValueError, df.to_hdf, path, 'df', complib='blosc:zlib') + self.assertRaises(ValueError, df.to_hdf, path, + 'df', complib='blosc:zlib') # GH10443 + def test_read_nokey(self): df = DataFrame(np.random.rand(4, 5), index=list('abcd'), @@ -4641,6 +4832,7 @@ def test_read_nokey(self): class TestHDFComplexValues(Base): # GH10447 + def test_complex_fixed(self): df = DataFrame(np.random.rand(4, 5).astype(np.complex64), index=list('abcd'), @@ -4679,7 +4871,8 @@ def test_complex_table(self): assert_frame_equal(df, reread) def test_complex_mixed_fixed(self): - complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64) + complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j, + 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64) complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128) df = DataFrame({'A': [1, 2, 3, 4], @@ -4694,7 +4887,8 @@ def 
test_complex_mixed_fixed(self): assert_frame_equal(df, reread) def test_complex_mixed_table(self): - complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64) + complex64 = np.array([1.0 + 1.0j, 1.0 + 1.0j, + 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex64) complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j], dtype=np.complex128) df = DataFrame({'A': [1, 2, 3, 4], @@ -4753,7 +4947,8 @@ def test_complex_indexing_error(self): 'C': complex128}, index=list('abcd')) with ensure_clean_store(self.path) as store: - self.assertRaises(TypeError, store.append, 'df', df, data_columns=['C']) + self.assertRaises(TypeError, store.append, + 'df', df, data_columns=['C']) def test_complex_series_error(self): complex128 = np.array([1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j, 1.0 + 1.0j]) @@ -4777,8 +4972,8 @@ def test_complex_append(self): result = store.select('df') assert_frame_equal(pd.concat([df, df], 0), result) -class TestTimezones(Base, tm.TestCase): +class TestTimezones(Base, tm.TestCase): def _compare_with_tz(self, a, b): tm.assert_frame_equal(a, b) @@ -4786,17 +4981,19 @@ def _compare_with_tz(self, a, b): # compare the zones on each element for c in a.columns: for i in a.index: - a_e = a.loc[i,c] - b_e = b.loc[i,c] + a_e = a.loc[i, c] + b_e = b.loc[i, c] if not (a_e == b_e and a_e.tz == b_e.tz): - raise AssertionError("invalid tz comparsion [%s] [%s]" % (a_e, b_e)) + raise AssertionError( + "invalid tz comparsion [%s] [%s]" % (a_e, b_e)) def test_append_with_timezones_dateutil(self): from datetime import timedelta tm._skip_if_no_dateutil() - # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows filename issues. + # use maybe_get_tz instead of dateutil.tz.gettz to handle the windows + # filename issues. from pandas.tslib import maybe_get_tz gettz = lambda x: maybe_get_tz('dateutil/' + x) @@ -4804,7 +5001,8 @@ def test_append_with_timezones_dateutil(self): with ensure_clean_store(self.path) as store: _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A=[ Timestamp('20130102 2:00:00', tz=gettz('US/Eastern')) + timedelta(hours=1) * i for i in range(5) ])) + df = DataFrame(dict(A=[Timestamp('20130102 2:00:00', tz=gettz( + 'US/Eastern')) + timedelta(hours=1) * i for i in range(5)])) store.append('df_tz', df, data_columns=['A']) result = store['df_tz'] @@ -4818,13 +5016,20 @@ def test_append_with_timezones_dateutil(self): # ensure we include dates in DST and STD time here. 
_maybe_remove(store, 'df_tz') - df = DataFrame(dict(A=Timestamp('20130102', tz=gettz('US/Eastern')), B=Timestamp('20130603', tz=gettz('US/Eastern'))), index=range(5)) + df = DataFrame(dict(A=Timestamp('20130102', + tz=gettz('US/Eastern')), + B=Timestamp('20130603', + tz=gettz('US/Eastern'))), + index=range(5)) store.append('df_tz', df) result = store['df_tz'] self._compare_with_tz(result, df) assert_frame_equal(result, df) - df = DataFrame(dict(A=Timestamp('20130102', tz=gettz('US/Eastern')), B=Timestamp('20130102', tz=gettz('EET'))), index=range(5)) + df = DataFrame(dict(A=Timestamp('20130102', + tz=gettz('US/Eastern')), + B=Timestamp('20130102', tz=gettz('EET'))), + index=range(5)) self.assertRaises(ValueError, store.append, 'df_tz', df) # this is ok @@ -4835,14 +5040,18 @@ def test_append_with_timezones_dateutil(self): assert_frame_equal(result, df) # can't append with diff timezone - df = DataFrame(dict(A=Timestamp('20130102', tz=gettz('US/Eastern')), B=Timestamp('20130102', tz=gettz('CET'))), index=range(5)) + df = DataFrame(dict(A=Timestamp('20130102', + tz=gettz('US/Eastern')), + B=Timestamp('20130102', tz=gettz('CET'))), + index=range(5)) self.assertRaises(ValueError, store.append, 'df_tz', df) # as index with ensure_clean_store(self.path) as store: # GH 4098 example - df = DataFrame(dict(A=Series(lrange(3), index=date_range('2000-1-1', periods=3, freq='H', tz=gettz('US/Eastern'))))) + df = DataFrame(dict(A=Series(lrange(3), index=date_range( + '2000-1-1', periods=3, freq='H', tz=gettz('US/Eastern'))))) _maybe_remove(store, 'df') store.put('df', df) @@ -4862,52 +5071,63 @@ def test_append_with_timezones_pytz(self): with ensure_clean_store(self.path) as store: _maybe_remove(store, 'df_tz') - df = DataFrame(dict(A = [ Timestamp('20130102 2:00:00',tz='US/Eastern') + timedelta(hours=1)*i for i in range(5) ])) - store.append('df_tz',df,data_columns=['A']) + df = DataFrame(dict(A=[Timestamp('20130102 2:00:00', + tz='US/Eastern') + + timedelta(hours=1) * i + for i in range(5)])) + store.append('df_tz', df, data_columns=['A']) result = store['df_tz'] - self._compare_with_tz(result,df) - assert_frame_equal(result,df) + self._compare_with_tz(result, df) + assert_frame_equal(result, df) # select with tz aware - self._compare_with_tz(store.select('df_tz',where=Term('A>=df.A[3]')),df[df.A>=df.A[3]]) + self._compare_with_tz(store.select( + 'df_tz', where=Term('A>=df.A[3]')), df[df.A >= df.A[3]]) _maybe_remove(store, 'df_tz') # ensure we include dates in DST and STD time here. 
- df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130603',tz='US/Eastern')),index=range(5)) - store.append('df_tz',df) + df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), + B=Timestamp('20130603', tz='US/Eastern')), + index=range(5)) + store.append('df_tz', df) result = store['df_tz'] - self._compare_with_tz(result,df) - assert_frame_equal(result,df) + self._compare_with_tz(result, df) + assert_frame_equal(result, df) - df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='EET')),index=range(5)) + df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), + B=Timestamp('20130102', tz='EET')), + index=range(5)) self.assertRaises(ValueError, store.append, 'df_tz', df) # this is ok _maybe_remove(store, 'df_tz') - store.append('df_tz',df,data_columns=['A','B']) + store.append('df_tz', df, data_columns=['A', 'B']) result = store['df_tz'] - self._compare_with_tz(result,df) - assert_frame_equal(result,df) + self._compare_with_tz(result, df) + assert_frame_equal(result, df) # can't append with diff timezone - df = DataFrame(dict(A = Timestamp('20130102',tz='US/Eastern'), B = Timestamp('20130102',tz='CET')),index=range(5)) + df = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), + B=Timestamp('20130102', tz='CET')), + index=range(5)) self.assertRaises(ValueError, store.append, 'df_tz', df) # as index with ensure_clean_store(self.path) as store: # GH 4098 example - df = DataFrame(dict(A = Series(lrange(3), index=date_range('2000-1-1',periods=3,freq='H', tz='US/Eastern')))) + df = DataFrame(dict(A=Series(lrange(3), index=date_range( + '2000-1-1', periods=3, freq='H', tz='US/Eastern')))) _maybe_remove(store, 'df') - store.put('df',df) + store.put('df', df) result = store.select('df') - assert_frame_equal(result,df) + assert_frame_equal(result, df) _maybe_remove(store, 'df') - store.append('df',df) + store.append('df', df) result = store.select('df') - assert_frame_equal(result,df) + assert_frame_equal(result, df) def test_tseries_select_index_column(self): # GH7777 @@ -4954,10 +5174,10 @@ def test_timezones_fixed(self): # as data # GH11411 _maybe_remove(store, 'df') - df = DataFrame({'A' : rng, - 'B' : rng.tz_convert('UTC').tz_localize(None), - 'C' : rng.tz_convert('CET'), - 'D' : range(len(rng))}, index=rng) + df = DataFrame({'A': rng, + 'B': rng.tz_convert('UTC').tz_localize(None), + 'C': rng.tz_convert('CET'), + 'D': range(len(rng))}, index=rng) store['df'] = df result = store['df'] assert_frame_equal(result, df) @@ -4974,7 +5194,8 @@ def test_fixed_offset_tz(self): def test_store_timezone(self): # GH2852 - # issue storing datetime.date with a timezone as it resets when read back in a new timezone + # issue storing datetime.date with a timezone as it resets when read + # back in a new timezone import platform if platform.system() == "Windows": @@ -4987,8 +5208,8 @@ def test_store_timezone(self): # original method with ensure_clean_store(self.path) as store: - today = datetime.date(2013,9,10) - df = DataFrame([1,2,3], index = [today, today, today]) + today = datetime.date(2013, 9, 10) + df = DataFrame([1, 2, 3], index=[today, today, today]) store['obj1'] = df result = store['obj1'] assert_frame_equal(result, df) @@ -5003,7 +5224,7 @@ def setTZ(tz): except: pass else: - os.environ['TZ']=tz + os.environ['TZ'] = tz time.tzset() try: @@ -5011,8 +5232,8 @@ def setTZ(tz): with ensure_clean_store(self.path) as store: setTZ('EST5EDT') - today = datetime.date(2013,9,10) - df = DataFrame([1,2,3], index = [today, today, 
today]) + today = datetime.date(2013, 9, 10) + df = DataFrame([1, 2, 3], index=[today, today, today]) store['obj1'] = df setTZ('CST6CDT') @@ -5026,8 +5247,12 @@ def setTZ(tz): def test_legacy_datetimetz_object(self): # legacy from < 0.17.0 # 8260 - expected = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), B=Timestamp('20130603', tz='CET')), index=range(5)) - with ensure_clean_store(tm.get_data_path('legacy_hdf/datetimetz_object.h5'), mode='r') as store: + expected = DataFrame(dict(A=Timestamp('20130102', tz='US/Eastern'), + B=Timestamp('20130603', tz='CET')), + index=range(5)) + with ensure_clean_store( + tm.get_data_path('legacy_hdf/datetimetz_object.h5'), + mode='r') as store: result = store['df'] assert_frame_equal(result, expected) @@ -5039,13 +5264,14 @@ def test_dst_transitions(self): freq="H", ambiguous='infer') - for i in [times, times+pd.Timedelta('10min')]: + for i in [times, times + pd.Timedelta('10min')]: _maybe_remove(store, 'df') - df = DataFrame({'A' : range(len(i)), 'B' : i }, index=i) - store.append('df',df) + df = DataFrame({'A': range(len(i)), 'B': i}, index=i) + store.append('df', df) result = store.select('df') assert_frame_equal(result, df) + def _test_sort(obj): if isinstance(obj, DataFrame): return obj.reindex(sorted(obj.index)) diff --git a/pandas/io/tests/test_sas.py b/pandas/io/tests/test_sas.py index 08737bfb60086..bca3594f4b47c 100644 --- a/pandas/io/tests/test_sas.py +++ b/pandas/io/tests/test_sas.py @@ -1,6 +1,5 @@ import pandas as pd import pandas.util.testing as tm -from pandas import compat from pandas.io.sas import XportReader, read_sas import numpy as np import os @@ -9,6 +8,8 @@ # Numbers in a SAS xport file are always float64, so need to convert # before making comparisons. + + def numeric_as_float(data): for v in data.columns: if data[v].dtype is np.dtype('int64'): @@ -24,7 +25,6 @@ def setUp(self): self.file03 = os.path.join(self.dirpath, "DRXFCD_G.XPT") self.file04 = os.path.join(self.dirpath, "paxraw_d_short.xpt") - def test1_basic(self): # Tests with DEMO_G.XPT (all numeric file) @@ -50,7 +50,6 @@ def test1_basic(self): data = read_sas(self.file01) tm.assert_frame_equal(data, data_csv) - def test1_index(self): # Tests with DEMO_G.XPT using index (all numeric file) @@ -66,13 +65,14 @@ def test1_index(self): # Test incremental read with `read` method. reader = XportReader(self.file01, index="SEQN") data = reader.read(10) - tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) + tm.assert_frame_equal(data, data_csv.iloc[ + 0:10, :], check_index_type=False) # Test incremental read with `get_chunk` method. 
reader = XportReader(self.file01, index="SEQN", chunksize=10) data = reader.get_chunk() - tm.assert_frame_equal(data, data_csv.iloc[0:10, :], check_index_type=False) - + tm.assert_frame_equal(data, data_csv.iloc[ + 0:10, :], check_index_type=False) def test1_incremental(self): # Test with DEMO_G.XPT, reading full file incrementally @@ -88,7 +88,6 @@ def test1_incremental(self): tm.assert_frame_equal(data, data_csv, check_index_type=False) - def test2(self): # Test with SSHSV1_A.XPT @@ -99,7 +98,6 @@ def test2(self): data = XportReader(self.file02).read() tm.assert_frame_equal(data, data_csv) - def test_multiple_types(self): # Test with DRXFCD_G.XPT (contains text and numeric variables) @@ -112,7 +110,6 @@ def test_multiple_types(self): data = read_sas(self.file03) tm.assert_frame_equal(data, data_csv) - def test_truncated_float_support(self): # Test with paxraw_d_short.xpt, a shortened version of: # http://wwwn.cdc.gov/Nchs/Nhanes/2005-2006/PAXRAW_D.ZIP diff --git a/pandas/io/tests/test_sql.py b/pandas/io/tests/test_sql.py index bfd1ac3f08ee8..455e27b70055d 100644 --- a/pandas/io/tests/test_sql.py +++ b/pandas/io/tests/test_sql.py @@ -6,12 +6,13 @@ - Tests for the public API (only tests with sqlite3) - `_TestSQLApi` base class - `TestSQLApi`: test the public API with sqlalchemy engine - - `TestSQLiteFallbackApi`: test the public API with a sqlite DBAPI connection + - `TestSQLiteFallbackApi`: test the public API with a sqlite DBAPI + connection - Tests for the different SQL flavors (flavor specific type conversions) - Tests for the sqlalchemy mode: `_TestSQLAlchemy` is the base class with common methods, `_TestSQLAlchemyConn` tests the API with a SQLAlchemy - Connection object. The different tested flavors (sqlite3, MySQL, PostgreSQL) - derive from the base class + Connection object. 
The different tested flavors (sqlite3, MySQL, + PostgreSQL) derive from the base class - Tests for the fallback mode (`TestSQLiteFallback` and `TestMySQLLegacy`) """ @@ -141,7 +142,8 @@ VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s) """, 'fields': ( - 'TextCol', 'DateCol', 'DateColWithTz', 'IntDateCol', 'FloatCol', + 'TextCol', 'DateCol', 'DateColWithTz', + 'IntDateCol', 'FloatCol', 'IntCol', 'BoolCol', 'IntColWithNull', 'BoolColWithNull' ) }, @@ -174,6 +176,7 @@ class MixInBase(object): + def tearDown(self): for tbl in self._get_all_tables(): self.drop_table(tbl) @@ -181,9 +184,11 @@ def tearDown(self): class MySQLMixIn(MixInBase): + def drop_table(self, table_name): cur = self.conn.cursor() - cur.execute("DROP TABLE IF EXISTS %s" % sql._get_valid_mysql_name(table_name)) + cur.execute("DROP TABLE IF EXISTS %s" % + sql._get_valid_mysql_name(table_name)) self.conn.commit() def _get_all_tables(self): @@ -200,12 +205,15 @@ def _close_conn(self): class SQLiteMixIn(MixInBase): + def drop_table(self, table_name): - self.conn.execute("DROP TABLE IF EXISTS %s" % sql._get_valid_sqlite_name(table_name)) + self.conn.execute("DROP TABLE IF EXISTS %s" % + sql._get_valid_sqlite_name(table_name)) self.conn.commit() def _get_all_tables(self): - c = self.conn.execute("SELECT name FROM sqlite_master WHERE type='table'") + c = self.conn.execute( + "SELECT name FROM sqlite_master WHERE type='table'") return [table[0] for table in c.fetchall()] def _close_conn(self): @@ -213,6 +221,7 @@ def _close_conn(self): class SQLAlchemyMixIn(MixInBase): + def drop_table(self, table_name): sql.SQLDatabase(self.conn).drop_table(table_name) @@ -225,6 +234,7 @@ def _get_all_tables(self): def _close_conn(self): pass + class PandasSQLTest(unittest.TestCase): """ Base class with common private methods for SQLAlchemy and fallback cases. 
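As context for the reformatted helpers in the next hunk (`_load_test1_data`, `_roundtrip`, `_to_sql_save_index`), they all exercise the same write-then-read cycle against an in-memory sqlite3 connection. A minimal, self-contained sketch of that pattern, using only the public `to_sql`/`read_sql_query` calls that appear in these tests (the table and column names here are illustrative, not the suite's fixtures):

    import sqlite3

    import pandas as pd
    import pandas.util.testing as tm

    # in-memory database, mirroring the sqlite-based API tests
    conn = sqlite3.connect(':memory:')
    df = pd.DataFrame({'A': [1, 2, 3], 'B': [0.1, 0.2, 0.3]})

    # write the frame out, then query it straight back
    df.to_sql('test_frame_roundtrip', conn, index=False)
    result = pd.read_sql_query('SELECT * FROM test_frame_roundtrip', conn)

    # for a simple frame like this, values and dtypes should survive
    # the round trip unchanged
    tm.assert_frame_equal(result, df)
    conn.close()
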
@@ -267,12 +277,14 @@ def _check_iris_loaded_frame(self, iris_frame): def _load_test1_data(self): columns = ['index', 'A', 'B', 'C', 'D'] data = [( - '2000-01-03 00:00:00', 0.980268513777, 3.68573087906, -0.364216805298, -1.15973806169), + '2000-01-03 00:00:00', 0.980268513777, 3.68573087906, + -0.364216805298, -1.15973806169), ('2000-01-04 00:00:00', 1.04791624281, - 0.0412318367011, -0.16181208307, 0.212549316967), ('2000-01-05 00:00:00', 0.498580885705, 0.731167677815, -0.537677223318, 1.34627041952), - ('2000-01-06 00:00:00', 1.12020151869, 1.56762092543, 0.00364077397681, 0.67525259227)] + ('2000-01-06 00:00:00', 1.12020151869, 1.56762092543, + 0.00364077397681, 0.67525259227)] self.test_frame1 = DataFrame(data, columns=columns) @@ -281,7 +293,8 @@ def _load_test2_data(self): B=['asd', 'gsq', 'ylt', 'jkl'], C=[1.1, 3.1, 6.9, 5.3], D=[False, True, True, False], - E=['1990-11-22', '1991-10-26', '1993-11-26', '1995-12-12'])) + E=['1990-11-22', '1991-10-26', + '1993-11-26', '1995-12-12'])) df['E'] = to_datetime(df['E']) self.test_frame2 = df @@ -423,7 +436,8 @@ def _to_sql_append(self): def _roundtrip(self): self.drop_table('test_frame_roundtrip') self.pandasSQL.to_sql(self.test_frame1, 'test_frame_roundtrip') - result = self.pandasSQL.read_query('SELECT * FROM test_frame_roundtrip') + result = self.pandasSQL.read_query( + 'SELECT * FROM test_frame_roundtrip') result.set_index('level_0', inplace=True) # result.index.astype(int) @@ -439,11 +453,11 @@ def _execute_sql(self): tm.equalContents(row, [5.1, 3.5, 1.4, 0.2, 'Iris-setosa']) def _to_sql_save_index(self): - df = DataFrame.from_records([(1,2.1,'line1'), (2,1.5,'line2')], - columns=['A','B','C'], index=['A']) + df = DataFrame.from_records([(1, 2.1, 'line1'), (2, 1.5, 'line2')], + columns=['A', 'B', 'C'], index=['A']) self.pandasSQL.to_sql(df, 'test_to_sql_saves_index') ix_cols = self._get_index_columns('test_to_sql_saves_index') - self.assertEqual(ix_cols, [['A',],]) + self.assertEqual(ix_cols, [['A', ], ]) def _transaction_test(self): self.pandasSQL.execute("CREATE TABLE test_trans (A INT, B TEXT)") @@ -468,8 +482,8 @@ def _transaction_test(self): self.assertEqual(len(res2), 1) -#------------------------------------------------------------------------------ -#--- Testing the public API +# ----------------------------------------------------------------------------- +# -- Testing the public API class _TestSQLApi(PandasSQLTest): @@ -477,9 +491,9 @@ class _TestSQLApi(PandasSQLTest): Base class to test the public API. From this two classes are derived to run these tests for both the - sqlalchemy mode (`TestSQLApi`) and the fallback mode (`TestSQLiteFallbackApi`). - These tests are run with sqlite3. Specific tests for the different - sql flavours are included in `_TestSQLAlchemy`. + sqlalchemy mode (`TestSQLApi`) and the fallback mode + (`TestSQLiteFallbackApi`). These tests are run with sqlite3. Specific + tests for the different sql flavours are included in `_TestSQLAlchemy`. 
Notes: flavor can always be passed even in SQLAlchemy mode, @@ -519,16 +533,19 @@ def test_legacy_read_frame(self): def test_to_sql(self): sql.to_sql(self.test_frame1, 'test_frame1', self.conn, flavor='sqlite') self.assertTrue( - sql.has_table('test_frame1', self.conn, flavor='sqlite'), 'Table not written to DB') + sql.has_table('test_frame1', self.conn, flavor='sqlite'), + 'Table not written to DB') def test_to_sql_fail(self): sql.to_sql(self.test_frame1, 'test_frame2', self.conn, flavor='sqlite', if_exists='fail') self.assertTrue( - sql.has_table('test_frame2', self.conn, flavor='sqlite'), 'Table not written to DB') + sql.has_table('test_frame2', self.conn, flavor='sqlite'), + 'Table not written to DB') self.assertRaises(ValueError, sql.to_sql, self.test_frame1, - 'test_frame2', self.conn, flavor='sqlite', if_exists='fail') + 'test_frame2', self.conn, flavor='sqlite', + if_exists='fail') def test_to_sql_replace(self): sql.to_sql(self.test_frame1, 'test_frame3', @@ -608,7 +625,7 @@ def test_roundtrip(self): def test_roundtrip_chunksize(self): sql.to_sql(self.test_frame1, 'test_frame_roundtrip', con=self.conn, - index=False, flavor='sqlite', chunksize=2) + index=False, flavor='sqlite', chunksize=2) result = sql.read_sql_query( 'SELECT * FROM test_frame_roundtrip', con=self.conn) @@ -668,14 +685,15 @@ def test_date_and_index(self): def test_timedelta(self): # see #6921 - df = to_timedelta(Series(['00:00:01', '00:00:03'], name='foo')).to_frame() + df = to_timedelta( + Series(['00:00:01', '00:00:03'], name='foo')).to_frame() with tm.assert_produces_warning(UserWarning): df.to_sql('test_timedelta', self.conn) result = sql.read_sql_query('SELECT * FROM test_timedelta', self.conn) tm.assert_series_equal(result['foo'], df['foo'].astype('int64')) def test_complex(self): - df = DataFrame({'a':[1+1j, 2j]}) + df = DataFrame({'a': [1 + 1j, 2j]}) # Complex data type should raise error self.assertRaises(ValueError, df.to_sql, 'test_complex', self.conn) @@ -711,7 +729,8 @@ def test_to_sql_index_label(self): def test_to_sql_index_label_multiindex(self): temp_frame = DataFrame({'col1': range(4)}, - index=MultiIndex.from_product([('A0', 'A1'), ('B0', 'B1')])) + index=MultiIndex.from_product( + [('A0', 'A1'), ('B0', 'B1')])) # no index name, defaults to 'level_0' and 'level_1' sql.to_sql(temp_frame, 'test_index_label', self.conn) @@ -747,12 +766,12 @@ def test_to_sql_index_label_multiindex(self): index_label='C') def test_multiindex_roundtrip(self): - df = DataFrame.from_records([(1,2.1,'line1'), (2,1.5,'line2')], - columns=['A','B','C'], index=['A','B']) + df = DataFrame.from_records([(1, 2.1, 'line1'), (2, 1.5, 'line2')], + columns=['A', 'B', 'C'], index=['A', 'B']) df.to_sql('test_multiindex_roundtrip', self.conn) result = sql.read_sql_query('SELECT * FROM test_multiindex_roundtrip', - self.conn, index_col=['A','B']) + self.conn, index_col=['A', 'B']) tm.assert_frame_equal(df, result, check_index_type=True) def test_integer_col_names(self): @@ -766,15 +785,15 @@ def test_get_schema(self): self.assertTrue('CREATE' in create_sql) def test_get_schema_dtypes(self): - float_frame = DataFrame({'a':[1.1,1.2], 'b':[2.1,2.2]}) + float_frame = DataFrame({'a': [1.1, 1.2], 'b': [2.1, 2.2]}) dtype = sqlalchemy.Integer if self.mode == 'sqlalchemy' else 'INTEGER' create_sql = sql.get_schema(float_frame, 'test', 'sqlite', - con=self.conn, dtype={'b':dtype}) + con=self.conn, dtype={'b': dtype}) self.assertTrue('CREATE' in create_sql) self.assertTrue('INTEGER' in create_sql) def test_get_schema_keys(self): - frame = 
DataFrame({'Col1':[1.1,1.2], 'Col2':[2.1,2.2]}) + frame = DataFrame({'Col1': [1.1, 1.2], 'Col2': [2.1, 2.2]}) create_sql = sql.get_schema(frame, 'test', 'sqlite', con=self.conn, keys='Col1') constraint_sentence = 'CONSTRAINT test_pk PRIMARY KEY ("Col1")' @@ -836,7 +855,7 @@ def test_categorical(self): def test_unicode_column_name(self): # GH 11431 - df = DataFrame([[1,2],[3,4]], columns = [u'\xe9',u'b']) + df = DataFrame([[1, 2], [3, 4]], columns=[u'\xe9', u'b']) df.to_sql('test_unicode', self.conn, index=False) @@ -874,12 +893,14 @@ def test_read_table_index_col(self): self.assertEqual(result.index.names, ["index"], "index_col not correctly set") - result = sql.read_sql_table('test_frame', self.conn, index_col=["A", "B"]) + result = sql.read_sql_table( + 'test_frame', self.conn, index_col=["A", "B"]) self.assertEqual(result.index.names, ["A", "B"], "index_col not correctly set") - result = sql.read_sql_table('test_frame', self.conn, index_col=["A", "B"], - columns=["C", "D"]) + result = sql.read_sql_table('test_frame', self.conn, + index_col=["A", "B"], + columns=["C", "D"]) self.assertEqual(result.index.names, ["A", "B"], "index_col not correctly set") self.assertEqual(result.columns.tolist(), ["C", "D"], @@ -923,7 +944,8 @@ def test_warning_case_insensitive_table_name(self): # This should not trigger a Warning self.test_frame1.to_sql('CaseSensitive', self.conn) # Verify some things - self.assertEqual(len(w), 0, "Warning triggered for writing a table") + self.assertEqual( + len(w), 0, "Warning triggered for writing a table") def _get_index_columns(self, tbl_name): from sqlalchemy.engine import reflection @@ -939,13 +961,16 @@ def test_sqlalchemy_type_mapping(self): utc=True)}) db = sql.SQLDatabase(self.conn) table = sql.SQLTable("test_type", db, frame=df) - self.assertTrue(isinstance(table.table.c['time'].type, sqltypes.DateTime)) + self.assertTrue(isinstance( + table.table.c['time'].type, sqltypes.DateTime)) def test_to_sql_read_sql_with_database_uri(self): # Test read_sql and .to_sql method with a database URI (GH10654) test_frame1 = self.test_frame1 - #db_uri = 'sqlite:///:memory:' # raises sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near "iris": syntax error [SQL: 'iris'] + # db_uri = 'sqlite:///:memory:' # raises + # sqlalchemy.exc.OperationalError: (sqlite3.OperationalError) near + # "iris": syntax error [SQL: 'iris'] with tm.ensure_clean() as name: db_uri = 'sqlite:///' + name table = 'iris' @@ -962,19 +987,20 @@ def _make_iris_table_metadata(self): sa = sqlalchemy metadata = sa.MetaData() iris = sa.Table('iris', metadata, - sa.Column('SepalLength', sa.REAL), - sa.Column('SepalWidth', sa.REAL), - sa.Column('PetalLength', sa.REAL), - sa.Column('PetalWidth', sa.REAL), - sa.Column('Name', sa.TEXT) - ) + sa.Column('SepalLength', sa.REAL), + sa.Column('SepalWidth', sa.REAL), + sa.Column('PetalLength', sa.REAL), + sa.Column('PetalWidth', sa.REAL), + sa.Column('Name', sa.TEXT) + ) return iris def test_query_by_text_obj(self): # WIP : GH10846 name_text = sqlalchemy.text('select * from iris where name=:name') - iris_df = sql.read_sql(name_text, self.conn, params={'name': 'Iris-versicolor'}) + iris_df = sql.read_sql(name_text, self.conn, params={ + 'name': 'Iris-versicolor'}) all_names = set(iris_df['Name']) self.assertEqual(all_names, set(['Iris-versicolor'])) @@ -982,8 +1008,10 @@ def test_query_by_select_obj(self): # WIP : GH10846 iris = self._make_iris_table_metadata() - name_select = sqlalchemy.select([iris]).where(iris.c.Name == sqlalchemy.bindparam('name')) - iris_df 
= sql.read_sql(name_select, self.conn, params={'name': 'Iris-setosa'}) + name_select = sqlalchemy.select([iris]).where( + iris.c.Name == sqlalchemy.bindparam('name')) + iris_df = sql.read_sql(name_select, self.conn, + params={'name': 'Iris-setosa'}) all_names = set(iris_df['Name']) self.assertEqual(all_names, set(['Iris-setosa'])) @@ -1093,8 +1121,8 @@ def test_sqlite_type_mapping(self): "TIMESTAMP") -#------------------------------------------------------------------------------ -#--- Database flavor specific tests +# ----------------------------------------------------------------------------- +# -- Database flavor specific tests class _TestSQLAlchemy(SQLAlchemyMixIn, PandasSQLTest): @@ -1148,7 +1176,8 @@ def setup_connect(self): # to test if connection can be made: self.conn.connect() except sqlalchemy.exc.OperationalError: - raise nose.SkipTest("Can't connect to {0} server".format(self.flavor)) + raise nose.SkipTest( + "Can't connect to {0} server".format(self.flavor)) def test_aread_sql(self): self._read_sql_iris() @@ -1241,7 +1270,7 @@ def test_default_type_conversion(self): def test_bigint(self): # int64 should be converted to BigInteger, GH7433 - df = DataFrame(data={'i64':[2**62]}) + df = DataFrame(data={'i64': [2**62]}) df.to_sql('test_bigint', self.conn, index=False) result = sql.read_sql_table('test_bigint', self.conn) @@ -1265,50 +1294,64 @@ def check(col): # or datetime64[ns, UTC] if com.is_datetime64_dtype(col.dtype): - # "2000-01-01 00:00:00-08:00" should convert to "2000-01-01 08:00:00" + # "2000-01-01 00:00:00-08:00" should convert to + # "2000-01-01 08:00:00" self.assertEqual(col[0], Timestamp('2000-01-01 08:00:00')) - # "2000-06-01 00:00:00-07:00" should convert to "2000-06-01 07:00:00" + # "2000-06-01 00:00:00-07:00" should convert to + # "2000-06-01 07:00:00" self.assertEqual(col[1], Timestamp('2000-06-01 07:00:00')) elif com.is_datetime64tz_dtype(col.dtype): self.assertTrue(str(col.dt.tz) == 'UTC') - # "2000-01-01 00:00:00-08:00" should convert to "2000-01-01 08:00:00" - self.assertEqual(col[0], Timestamp('2000-01-01 08:00:00', tz='UTC')) + # "2000-01-01 00:00:00-08:00" should convert to + # "2000-01-01 08:00:00" + self.assertEqual(col[0], Timestamp( + '2000-01-01 08:00:00', tz='UTC')) - # "2000-06-01 00:00:00-07:00" should convert to "2000-06-01 07:00:00" - self.assertEqual(col[1], Timestamp('2000-06-01 07:00:00', tz='UTC')) + # "2000-06-01 00:00:00-07:00" should convert to + # "2000-06-01 07:00:00" + self.assertEqual(col[1], Timestamp( + '2000-06-01 07:00:00', tz='UTC')) else: - raise AssertionError("DateCol loaded with incorrect type -> {0}".format(col.dtype)) + raise AssertionError("DateCol loaded with incorrect type " + "-> {0}".format(col.dtype)) # GH11216 df = pd.read_sql_query("select * from types_test_data", self.conn) - if not hasattr(df,'DateColWithTz'): + if not hasattr(df, 'DateColWithTz'): raise nose.SkipTest("no column with datetime with time zone") # this is parsed on Travis (linux), but not on macosx for some reason - # even with the same versions of psycopg2 & sqlalchemy, possibly a Postgrsql server - # version difference + # even with the same versions of psycopg2 & sqlalchemy, possibly a + # Postgrsql server version difference col = df.DateColWithTz - self.assertTrue(com.is_object_dtype(col.dtype) or com.is_datetime64_dtype(col.dtype) \ - or com.is_datetime64tz_dtype(col.dtype), - "DateCol loaded with incorrect type -> {0}".format(col.dtype)) - - df = pd.read_sql_query("select * from types_test_data", self.conn, parse_dates=['DateColWithTz']) - if 
not hasattr(df,'DateColWithTz'): + self.assertTrue(com.is_object_dtype(col.dtype) or + com.is_datetime64_dtype(col.dtype) or + com.is_datetime64tz_dtype(col.dtype), + "DateCol loaded with incorrect type -> {0}" + .format(col.dtype)) + + df = pd.read_sql_query("select * from types_test_data", + self.conn, parse_dates=['DateColWithTz']) + if not hasattr(df, 'DateColWithTz'): raise nose.SkipTest("no column with datetime with time zone") check(df.DateColWithTz) df = pd.concat(list(pd.read_sql_query("select * from types_test_data", - self.conn,chunksize=1)),ignore_index=True) + self.conn, chunksize=1)), + ignore_index=True) col = df.DateColWithTz self.assertTrue(com.is_datetime64tz_dtype(col.dtype), - "DateCol loaded with incorrect type -> {0}".format(col.dtype)) + "DateCol loaded with incorrect type -> {0}" + .format(col.dtype)) self.assertTrue(str(col.dt.tz) == 'UTC') expected = sql.read_sql_table("types_test_data", self.conn) - tm.assert_series_equal(df.DateColWithTz, expected.DateColWithTz.astype('datetime64[ns, UTC]')) + tm.assert_series_equal(df.DateColWithTz, + expected.DateColWithTz + .astype('datetime64[ns, UTC]')) # xref #7139 # this might or might not be converted depending on the postgres driver @@ -1330,7 +1373,7 @@ def test_date_parsing(self): "DateCol loaded with incorrect type") df = sql.read_sql_table("types_test_data", self.conn, parse_dates={ - 'DateCol': {'format': '%Y-%m-%d %H:%M:%S'}}) + 'DateCol': {'format': '%Y-%m-%d %H:%M:%S'}}) self.assertTrue(issubclass(df.DateCol.dtype.type, np.datetime64), "IntDateCol loaded with incorrect type") @@ -1344,8 +1387,8 @@ def test_date_parsing(self): self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64), "IntDateCol loaded with incorrect type") - df = sql.read_sql_table( - "types_test_data", self.conn, parse_dates={'IntDateCol': {'unit': 's'}}) + df = sql.read_sql_table("types_test_data", self.conn, + parse_dates={'IntDateCol': {'unit': 's'}}) self.assertTrue(issubclass(df.IntDateCol.dtype.type, np.datetime64), "IntDateCol loaded with incorrect type") @@ -1405,8 +1448,8 @@ def test_datetime_time(self): def test_mixed_dtype_insert(self): # see GH6509 - s1 = Series(2**25 + 1,dtype=np.int32) - s2 = Series(0.0,dtype=np.float32) + s1 = Series(2**25 + 1, dtype=np.int32) + s2 = Series(0.0, dtype=np.float32) df = DataFrame({'s1': s1, 's2': s2}) # write and read again @@ -1417,7 +1460,7 @@ def test_mixed_dtype_insert(self): def test_nan_numeric(self): # NaNs in numeric float column - df = DataFrame({'A':[0, 1, 2], 'B':[0.2, np.nan, 5.6]}) + df = DataFrame({'A': [0, 1, 2], 'B': [0.2, np.nan, 5.6]}) df.to_sql('test_nan', self.conn, index=False) # with read_table @@ -1430,7 +1473,7 @@ def test_nan_numeric(self): def test_nan_fullcolumn(self): # full NaN column (numeric float column) - df = DataFrame({'A':[0, 1, 2], 'B':[np.nan, np.nan, np.nan]}) + df = DataFrame({'A': [0, 1, 2], 'B': [np.nan, np.nan, np.nan]}) df.to_sql('test_nan', self.conn, index=False) # with read_table @@ -1445,7 +1488,7 @@ def test_nan_fullcolumn(self): def test_nan_string(self): # NaNs in string column - df = DataFrame({'A':[0, 1, 2], 'B':['a', 'b', np.nan]}) + df = DataFrame({'A': [0, 1, 2], 'B': ['a', 'b', np.nan]}) df.to_sql('test_nan', self.conn, index=False) # NaNs are coming back as None @@ -1485,7 +1528,8 @@ def test_get_schema_create_table(self): self.drop_table(tbl) self.conn.execute(create_sql) returned_df = sql.read_sql_table(tbl, self.conn) - tm.assert_frame_equal(returned_df, blank_test_df, check_index_type=False) + 
tm.assert_frame_equal(returned_df, blank_test_df, + check_index_type=False) self.drop_table(tbl) def test_dtype(self): @@ -1510,16 +1554,16 @@ def test_dtype(self): self.assertEqual(sqltype.length, 10) def test_notnull_dtype(self): - cols = {'Bool': Series([True,None]), + cols = {'Bool': Series([True, None]), 'Date': Series([datetime(2012, 5, 1), None]), - 'Int' : Series([1, None], dtype='object'), + 'Int': Series([1, None], dtype='object'), 'Float': Series([1.1, None]) - } + } df = DataFrame(cols) tbl = 'notnull_dtype_test' df.to_sql(tbl, self.conn) - returned_df = sql.read_sql_table(tbl, self.conn) + returned_df = sql.read_sql_table(tbl, self.conn) # noqa meta = sqlalchemy.schema.MetaData(bind=self.conn) meta.reflect() if self.flavor == 'mysql': @@ -1537,20 +1581,20 @@ def test_notnull_dtype(self): def test_double_precision(self): V = 1.23456789101112131415 - df = DataFrame({'f32':Series([V,], dtype='float32'), - 'f64':Series([V,], dtype='float64'), - 'f64_as_f32':Series([V,], dtype='float64'), - 'i32':Series([5,], dtype='int32'), - 'i64':Series([5,], dtype='int64'), + df = DataFrame({'f32': Series([V, ], dtype='float32'), + 'f64': Series([V, ], dtype='float64'), + 'f64_as_f32': Series([V, ], dtype='float64'), + 'i32': Series([5, ], dtype='int32'), + 'i64': Series([5, ], dtype='int64'), }) df.to_sql('test_dtypes', self.conn, index=False, if_exists='replace', - dtype={'f64_as_f32':sqlalchemy.Float(precision=23)}) + dtype={'f64_as_f32': sqlalchemy.Float(precision=23)}) res = sql.read_sql_table('test_dtypes', self.conn) # check precision of float64 - self.assertEqual(np.round(df['f64'].iloc[0],14), - np.round(res['f64'].iloc[0],14)) + self.assertEqual(np.round(df['f64'].iloc[0], 14), + np.round(res['f64'].iloc[0], 14)) # check sql types meta = sqlalchemy.schema.MetaData(bind=self.conn) @@ -1572,7 +1616,8 @@ def foo(connection): return sql.read_sql_query(query, con=connection) def bar(connection, data): - data.to_sql(name='test_foo_data', con=connection, if_exists='append') + data.to_sql(name='test_foo_data', + con=connection, if_exists='append') def main(connectable): with connectable.connect() as conn: @@ -1580,7 +1625,8 @@ def main(connectable): foo_data = conn.run_callable(foo) conn.run_callable(bar, foo_data) - DataFrame({'test_foo_data': [0, 1, 2]}).to_sql('test_foo_data', self.conn) + DataFrame({'test_foo_data': [0, 1, 2]}).to_sql( + 'test_foo_data', self.conn) main(self.conn) def test_temporary_table(self): @@ -1610,8 +1656,10 @@ class Temporary(Base): class _TestSQLAlchemyConn(_EngineToConnMixin, _TestSQLAlchemy): + def test_transactions(self): - raise nose.SkipTest("Nested transactions rollbacks don't work with Pandas") + raise nose.SkipTest( + "Nested transactions rollbacks don't work with Pandas") class _TestSQLiteAlchemy(object): @@ -1657,7 +1705,7 @@ def test_default_date_load(self): def test_bigint_warning(self): # test no warning for BIGINT (to support int64) is raised (GH7433) - df = DataFrame({'a':[1,2]}, dtype='int64') + df = DataFrame({'a': [1, 2]}, dtype='int64') df.to_sql('test_bigintwarning', self.conn, index=False) with warnings.catch_warnings(record=True) as w: @@ -1681,7 +1729,7 @@ def connect(cls): @classmethod def setup_driver(cls): try: - import pymysql + import pymysql # noqa cls.driver = 'pymysql' except ImportError: raise nose.SkipTest('pymysql not installed') @@ -1707,7 +1755,7 @@ def test_default_type_conversion(self): def test_read_procedure(self): # see GH7324. 
Although it is more an api test, it is added to the # mysql tests as sqlite does not have stored procedures - df = DataFrame({'a': [1, 2, 3], 'b':[0.1, 0.2, 0.3]}) + df = DataFrame({'a': [1, 2, 3], 'b': [0.1, 0.2, 0.3]}) df.to_sql('test_procedure', self.conn, index=False) proc = """DROP PROCEDURE IF EXISTS get_testdb; @@ -1721,7 +1769,7 @@ def test_read_procedure(self): connection = self.conn.connect() trans = connection.begin() try: - r1 = connection.execute(proc) + r1 = connection.execute(proc) # noqa trans.commit() except: trans.rollback() @@ -1750,14 +1798,16 @@ def connect(cls): @classmethod def setup_driver(cls): try: - import psycopg2 + import psycopg2 # noqa cls.driver = 'psycopg2' except ImportError: raise nose.SkipTest('psycopg2 not installed') def test_schema_support(self): - # only test this for postgresql (schema's not supported in mysql/sqlite) - df = DataFrame({'col1':[1, 2], 'col2':[0.1, 0.2], 'col3':['a', 'n']}) + # only test this for postgresql (schema's not supported in + # mysql/sqlite) + df = DataFrame({'col1': [1, 2], 'col2': [ + 0.1, 0.2], 'col3': ['a', 'n']}) # create a schema self.conn.execute("DROP SCHEMA IF EXISTS other CASCADE;") @@ -1783,7 +1833,7 @@ def test_schema_support(self): self.assertRaises(ValueError, sql.read_sql_table, 'test_schema_other', self.conn, schema='public') - ## different if_exists options + # different if_exists options # create a schema self.conn.execute("DROP SCHEMA IF EXISTS other CASCADE;") @@ -1795,10 +1845,11 @@ def test_schema_support(self): if_exists='replace') df.to_sql('test_schema_other', self.conn, schema='other', index=False, if_exists='append') - res = sql.read_sql_table('test_schema_other', self.conn, schema='other') + res = sql.read_sql_table( + 'test_schema_other', self.conn, schema='other') tm.assert_frame_equal(concat([df, df], ignore_index=True), res) - ## specifying schema in user-provided meta + # specifying schema in user-provided meta # The schema won't be applied on another Connection # because of transactional schemas @@ -1807,12 +1858,16 @@ def test_schema_support(self): meta = sqlalchemy.MetaData(engine2, schema='other') pdsql = sql.SQLDatabase(engine2, meta=meta) pdsql.to_sql(df, 'test_schema_other2', index=False) - pdsql.to_sql(df, 'test_schema_other2', index=False, if_exists='replace') - pdsql.to_sql(df, 'test_schema_other2', index=False, if_exists='append') - res1 = sql.read_sql_table('test_schema_other2', self.conn, schema='other') + pdsql.to_sql(df, 'test_schema_other2', + index=False, if_exists='replace') + pdsql.to_sql(df, 'test_schema_other2', + index=False, if_exists='append') + res1 = sql.read_sql_table( + 'test_schema_other2', self.conn, schema='other') res2 = pdsql.read_table('test_schema_other2') tm.assert_frame_equal(res1, res2) + class TestMySQLAlchemy(_TestMySQLAlchemy, _TestSQLAlchemy): pass @@ -1837,8 +1892,8 @@ class TestSQLiteAlchemyConn(_TestSQLiteAlchemy, _TestSQLAlchemyConn): pass -#------------------------------------------------------------------------------ -#--- Test Sqlite / MySQL fallback +# ----------------------------------------------------------------------------- +# -- Test Sqlite / MySQL fallback class TestSQLiteFallback(SQLiteMixIn, PandasSQLTest): """ @@ -1961,9 +2016,11 @@ def test_dtype(self): df.to_sql('dtype_test2', self.conn, dtype={'B': 'STRING'}) # sqlite stores Boolean values as INTEGER - self.assertEqual(self._get_sqlite_column_type('dtype_test', 'B'), 'INTEGER') + self.assertEqual(self._get_sqlite_column_type( + 'dtype_test', 'B'), 'INTEGER') - 
self.assertEqual(self._get_sqlite_column_type('dtype_test2', 'B'), 'STRING') + self.assertEqual(self._get_sqlite_column_type( + 'dtype_test2', 'B'), 'STRING') self.assertRaises(ValueError, df.to_sql, 'error', self.conn, dtype={'B': bool}) @@ -1971,18 +2028,19 @@ def test_notnull_dtype(self): if self.flavor == 'mysql': raise nose.SkipTest('Not applicable to MySQL legacy') - cols = {'Bool': Series([True,None]), + cols = {'Bool': Series([True, None]), 'Date': Series([datetime(2012, 5, 1), None]), - 'Int' : Series([1, None], dtype='object'), + 'Int': Series([1, None], dtype='object'), 'Float': Series([1.1, None]) - } + } df = DataFrame(cols) tbl = 'notnull_dtype_test' df.to_sql(tbl, self.conn) self.assertEqual(self._get_sqlite_column_type(tbl, 'Bool'), 'INTEGER') - self.assertEqual(self._get_sqlite_column_type(tbl, 'Date'), 'TIMESTAMP') + self.assertEqual(self._get_sqlite_column_type( + tbl, 'Date'), 'TIMESTAMP') self.assertEqual(self._get_sqlite_column_type(tbl, 'Int'), 'INTEGER') self.assertEqual(self._get_sqlite_column_type(tbl, 'Float'), 'REAL') @@ -1992,17 +2050,18 @@ def test_illegal_names(self): # Raise error on blank self.assertRaises(ValueError, df.to_sql, "", self.conn, - flavor=self.flavor) + flavor=self.flavor) - for ndx, weird_name in enumerate(['test_weird_name]','test_weird_name[', - 'test_weird_name`','test_weird_name"', 'test_weird_name\'', - '_b.test_weird_name_01-30', '"_b.test_weird_name_01-30"', - '99beginswithnumber', '12345', u'\xe9']): + for ndx, weird_name in enumerate( + ['test_weird_name]', 'test_weird_name[', + 'test_weird_name`', 'test_weird_name"', 'test_weird_name\'', + '_b.test_weird_name_01-30', '"_b.test_weird_name_01-30"', + '99beginswithnumber', '12345', u'\xe9']): df.to_sql(weird_name, self.conn, flavor=self.flavor) sql.table_exists(weird_name, self.conn) df2 = DataFrame([[1, 2], [3, 4]], columns=['a', weird_name]) - c_tbl = 'test_weird_col_name%d'%ndx + c_tbl = 'test_weird_col_name%d' % ndx df2.to_sql(c_tbl, self.conn, flavor=self.flavor) sql.table_exists(c_tbl, self.conn) @@ -2022,7 +2081,8 @@ def setUpClass(cls): try: cls.connect() except cls.driver.err.OperationalError: - raise nose.SkipTest("{0} - can't connect to MySQL server".format(cls)) + raise nose.SkipTest( + "{0} - can't connect to MySQL server".format(cls)) @classmethod def setup_driver(cls): @@ -2034,7 +2094,8 @@ def setup_driver(cls): @classmethod def connect(cls): - return cls.driver.connect(host='127.0.0.1', user='root', passwd='', db='pandas_nosetest') + return cls.driver.connect(host='127.0.0.1', user='root', passwd='', + db='pandas_nosetest') def _count_rows(self, table_name): cur = self._get_exec() @@ -2072,14 +2133,15 @@ def _get_index_columns(self, tbl_name): ix_cols[ix_name].append(ix_col) return list(ix_cols.values()) - def test_to_sql_save_index(self): - self._to_sql_save_index() + # TODO: cruft? 
+ # def test_to_sql_save_index(self): + # self._to_sql_save_index() - for ix_name, ix_col in zip(ixs.Key_name, ixs.Column_name): - if ix_name not in ix_cols: - ix_cols[ix_name] = [] - ix_cols[ix_name].append(ix_col) - return ix_cols.values() + # for ix_name, ix_col in zip(ixs.Key_name, ixs.Column_name): + # if ix_name not in ix_cols: + # ix_cols[ix_name] = [] + # ix_cols[ix_name].append(ix_col) + # return ix_cols.values() def test_to_sql_save_index(self): self._to_sql_save_index() @@ -2088,27 +2150,31 @@ def test_illegal_names(self): df = DataFrame([[1, 2], [3, 4]], columns=['a', 'b']) # These tables and columns should be ok - for ndx, ok_name in enumerate(['99beginswithnumber','12345']): + for ndx, ok_name in enumerate(['99beginswithnumber', '12345']): df.to_sql(ok_name, self.conn, flavor=self.flavor, index=False, if_exists='replace') df2 = DataFrame([[1, 2], [3, 4]], columns=['a', ok_name]) - df2.to_sql('test_ok_col_name', self.conn, flavor=self.flavor, index=False, - if_exists='replace') + df2.to_sql('test_ok_col_name', self.conn, + flavor=self.flavor, index=False, + if_exists='replace') # For MySQL, these should raise ValueError - for ndx, illegal_name in enumerate(['test_illegal_name]','test_illegal_name[', - 'test_illegal_name`','test_illegal_name"', 'test_illegal_name\'', '']): + for ndx, illegal_name in enumerate( + ['test_illegal_name]', 'test_illegal_name[', + 'test_illegal_name`', 'test_illegal_name"', + 'test_illegal_name\'', '']): self.assertRaises(ValueError, df.to_sql, illegal_name, self.conn, - flavor=self.flavor, index=False) + flavor=self.flavor, index=False) df2 = DataFrame([[1, 2], [3, 4]], columns=['a', illegal_name]) - self.assertRaises(ValueError, df2.to_sql, 'test_illegal_col_name%d'%ndx, - self.conn, flavor=self.flavor, index=False) + self.assertRaises(ValueError, df2.to_sql, + 'test_illegal_col_name%d' % ndx, + self.conn, flavor=self.flavor, index=False) -#------------------------------------------------------------------------------ -#--- Old tests from 0.13.1 (before refactor using sqlalchemy) +# ----------------------------------------------------------------------------- +# -- Old tests from 0.13.1 (before refactor using sqlalchemy) _formatters = { @@ -2124,6 +2190,7 @@ def test_illegal_names(self): bool: lambda x: "'%s'" % x, } + def format_query(sql, *args): """ @@ -2138,9 +2205,10 @@ def format_query(sql, *args): return sql % tuple(processed_args) + def _skip_if_no_pymysql(): try: - import pymysql + import pymysql # noqa except ImportError: raise nose.SkipTest('pymysql not installed, skipping') @@ -2283,7 +2351,7 @@ def test_tquery(self): frame = tm.makeTimeDataFrame() sql.write_frame(frame, name='test_table', con=self.conn) result = sql.tquery("select A from test_table", self.conn) - expected = Series(frame.A.values, frame.index) # not to have name + expected = Series(frame.A.values, frame.index) # not to have name result = Series(result, frame.index) tm.assert_series_equal(result, expected) @@ -2318,27 +2386,29 @@ def test_uquery(self): def test_keyword_as_column_names(self): ''' ''' - df = DataFrame({'From':np.ones(5)}) - sql.write_frame(df, con = self.conn, name = 'testkeywords') + df = DataFrame({'From': np.ones(5)}) + sql.write_frame(df, con=self.conn, name='testkeywords') def test_onecolumn_of_integer(self): # GH 3628 # a column_of_integers dataframe should transfer well to sql - mono_df=DataFrame([1 , 2], columns=['c0']) - sql.write_frame(mono_df, con = self.conn, name = 'mono_df') + mono_df = DataFrame([1, 2], columns=['c0']) + 
sql.write_frame(mono_df, con=self.conn, name='mono_df') # computing the sum via sql - con_x=self.conn - the_sum=sum([my_c0[0] for my_c0 in con_x.execute("select * from mono_df")]) + con_x = self.conn + the_sum = sum([my_c0[0] + for my_c0 in con_x.execute("select * from mono_df")]) # it should not fail, and gives 3 ( Issue #3628 ) - self.assertEqual(the_sum , 3) + self.assertEqual(the_sum, 3) - result = sql.read_frame("select * from mono_df",con_x) - tm.assert_frame_equal(result,mono_df) + result = sql.read_frame("select * from mono_df", con_x) + tm.assert_frame_equal(result, mono_df) def test_if_exists(self): df_if_exists_1 = DataFrame({'col1': [1, 2], 'col2': ['A', 'B']}) - df_if_exists_2 = DataFrame({'col1': [3, 4, 5], 'col2': ['C', 'D', 'E']}) + df_if_exists_2 = DataFrame( + {'col1': [3, 4, 5], 'col2': ['C', 'D', 'E']}) table_name = 'table_if_exists' sql_select = "SELECT * FROM %s" % table_name @@ -2412,12 +2482,12 @@ def setUpClass(cls): return try: pymysql.connect(read_default_group='pandas') - except pymysql.ProgrammingError as e: + except pymysql.ProgrammingError: raise nose.SkipTest( "Create a group of connection parameters under the heading " "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") - except pymysql.Error as e: + except pymysql.Error: raise nose.SkipTest( "Cannot connect to database. " "Create a group of connection parameters under the heading " @@ -2430,27 +2500,26 @@ def setUp(self): try: # Try Travis defaults. # No real user should allow root access with a blank password. - self.conn = pymysql.connect(host='localhost', user='root', passwd='', - db='pandas_nosetest') + self.conn = pymysql.connect(host='localhost', user='root', + passwd='', db='pandas_nosetest') except: pass else: return try: self.conn = pymysql.connect(read_default_group='pandas') - except pymysql.ProgrammingError as e: + except pymysql.ProgrammingError: raise nose.SkipTest( "Create a group of connection parameters under the heading " "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") - except pymysql.Error as e: + except pymysql.Error: raise nose.SkipTest( "Cannot connect to database. " "Create a group of connection parameters under the heading " "[pandas] in your system's mysql default file, " "typically located at ~/.my.cnf or /etc/.my.cnf. ") - def test_basic(self): _skip_if_no_pymysql() frame = tm.makeTimeDataFrame() @@ -2586,7 +2655,6 @@ def test_execute_closed_connection(self): # Initialize connection again (needed for tearDown) self.setUp() - def test_na_roundtrip(self): _skip_if_no_pymysql() pass @@ -2598,7 +2666,8 @@ def _check_roundtrip(self, frame): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") cur.execute(drop_sql) - sql.write_frame(frame, name='test_table', con=self.conn, flavor='mysql') + sql.write_frame(frame, name='test_table', + con=self.conn, flavor='mysql') result = sql.read_frame("select * from test_table", self.conn) # HACK! Change this once indexes are handled properly. 
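The surrounding hunks keep reformatting the old helpers retained from 0.13.1 (`write_frame`, `read_frame`, `tquery`), which the SQLAlchemy-based functions later replaced. A rough sketch of how the two generations line up, assuming a plain sqlite3 connection (the MySQL variants above differ mainly by passing `flavor='mysql'` and a pymysql connection); names are illustrative only:

    import sqlite3

    import pandas as pd
    import pandas.io.sql as sql

    conn = sqlite3.connect(':memory:')
    frame = pd.DataFrame({'A': [1.0, 2.0], 'B': [3.0, 4.0]})

    # legacy helpers still covered by these tests, deprecated in favour
    # of the SQLAlchemy-backed functions
    sql.write_frame(frame, name='test_table', con=conn, flavor='sqlite')
    legacy = sql.read_frame('select * from test_table', conn)

    # the current equivalents used by the newer test classes above
    frame.to_sql('test_table2', conn, index=False)
    current = pd.read_sql_query('select * from test_table2', conn)
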
@@ -2617,7 +2686,8 @@ def _check_roundtrip(self, frame): with warnings.catch_warnings(): warnings.filterwarnings("ignore", "Unknown table.*") cur.execute(drop_sql) - sql.write_frame(frame2, name='test_table2', con=self.conn, flavor='mysql') + sql.write_frame(frame2, name='test_table2', + con=self.conn, flavor='mysql') result = sql.read_frame("select * from test_table2", self.conn, index_col='Idx') expected = frame.copy() @@ -2629,16 +2699,17 @@ def _check_roundtrip(self, frame): def test_tquery(self): try: - import pymysql + import pymysql # noqa except ImportError: raise nose.SkipTest("no pymysql") frame = tm.makeTimeDataFrame() drop_sql = "DROP TABLE IF EXISTS test_table" cur = self.conn.cursor() cur.execute(drop_sql) - sql.write_frame(frame, name='test_table', con=self.conn, flavor='mysql') + sql.write_frame(frame, name='test_table', + con=self.conn, flavor='mysql') result = sql.tquery("select A from test_table", self.conn) - expected = Series(frame.A.values, frame.index) # not to have name + expected = Series(frame.A.values, frame.index) # not to have name result = Series(result, frame.index) tm.assert_series_equal(result, expected) @@ -2654,14 +2725,15 @@ def test_tquery(self): def test_uquery(self): try: - import pymysql + import pymysql # noqa except ImportError: raise nose.SkipTest("no pymysql") frame = tm.makeTimeDataFrame() drop_sql = "DROP TABLE IF EXISTS test_table" cur = self.conn.cursor() cur.execute(drop_sql) - sql.write_frame(frame, name='test_table', con=self.conn, flavor='mysql') + sql.write_frame(frame, name='test_table', + con=self.conn, flavor='mysql') stmt = 'INSERT INTO test_table VALUES(2.314, -123.1, 1.234, 2.3)' self.assertEqual(sql.uquery(stmt, con=self.conn), 1) @@ -2681,14 +2753,15 @@ def test_keyword_as_column_names(self): ''' ''' _skip_if_no_pymysql() - df = DataFrame({'From':np.ones(5)}) - sql.write_frame(df, con = self.conn, name = 'testkeywords', + df = DataFrame({'From': np.ones(5)}) + sql.write_frame(df, con=self.conn, name='testkeywords', if_exists='replace', flavor='mysql') def test_if_exists(self): _skip_if_no_pymysql() df_if_exists_1 = DataFrame({'col1': [1, 2], 'col2': ['A', 'B']}) - df_if_exists_2 = DataFrame({'col1': [3, 4, 5], 'col2': ['C', 'D', 'E']}) + df_if_exists_2 = DataFrame( + {'col1': [3, 4, 5], 'col2': ['C', 'D', 'E']}) table_name = 'table_if_exists' sql_select = "SELECT * FROM %s" % table_name diff --git a/pandas/io/tests/test_stata.py b/pandas/io/tests/test_stata.py index 86dfbc8f76a9b..e1e12e47457f9 100644 --- a/pandas/io/tests/test_stata.py +++ b/pandas/io/tests/test_stata.py @@ -18,7 +18,7 @@ from pandas.core.common import is_categorical_dtype from pandas.io.parsers import read_csv from pandas.io.stata import (read_stata, StataReader, InvalidColumnName, - PossiblePrecisionLoss, StataMissingValue) + PossiblePrecisionLoss, StataMissingValue) import pandas.util.testing as tm from pandas.tslib import NaT from pandas import compat @@ -92,14 +92,14 @@ def test_read_empty_dta(self): empty_ds = DataFrame(columns=['unit']) # GH 7369, make sure can read a 0-obs dta file with tm.ensure_clean() as path: - empty_ds.to_stata(path,write_index=False) + empty_ds.to_stata(path, write_index=False) empty_ds2 = read_stata(path) tm.assert_frame_equal(empty_ds, empty_ds2) def test_data_method(self): # Minimal testing of legacy data method with StataReader(self.dta1_114) as rdr: - with warnings.catch_warnings(record=True) as w: + with warnings.catch_warnings(record=True) as w: # noqa parsed_114_data = rdr.data() with StataReader(self.dta1_114) as rdr: @@ 
-184,9 +184,12 @@ def test_read_dta2(self): # buggy test because of the NaT comparison on certain platforms # Format 113 test fails since it does not support tc and tC formats # tm.assert_frame_equal(parsed_113, expected) - tm.assert_frame_equal(parsed_114, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_115, expected, check_datetimelike_compat=True) - tm.assert_frame_equal(parsed_117, expected, check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_114, expected, + check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_115, expected, + check_datetimelike_compat=True) + tm.assert_frame_equal(parsed_117, expected, + check_datetimelike_compat=True) def test_read_dta3(self): parsed_113 = self.read_dta(self.dta3_113) @@ -228,7 +231,8 @@ def test_read_dta4(self): 'labeled_with_missings', 'float_labelled']) # these are all categoricals - expected = pd.concat([expected[col].astype('category') for col in expected], axis=1) + expected = pd.concat([expected[col].astype('category') + for col in expected], axis=1) tm.assert_frame_equal(parsed_113, expected) tm.assert_frame_equal(parsed_114, expected) @@ -248,7 +252,6 @@ def test_read_dta12(self): tm.assert_frame_equal(parsed_117, expected, check_dtype=False) - def test_read_dta18(self): parsed_118 = self.read_dta(self.dta22_118) parsed_118["Bytes"] = parsed_118["Bytes"].astype('O') @@ -257,16 +260,18 @@ def test_read_dta18(self): ['Dog', 'Boston', u'Uzunköprü', np.nan, np.nan, np.nan, np.nan], ['Plane', 'Rome', u'Tromsø', 0, 0.0, 'option a', 0.0], ['Potato', 'Tokyo', u'Elâzığ', -4, 4.0, 4, 4], - ['', '', '', 0, 0.3332999, 'option a', 1/3.] + ['', '', '', 0, 0.3332999, 'option a', 1 / 3.] ], - columns=['Things', 'Cities', 'Unicode_Cities_Strl', 'Ints', 'Floats', 'Bytes', 'Longs']) + columns=['Things', 'Cities', 'Unicode_Cities_Strl', + 'Ints', 'Floats', 'Bytes', 'Longs']) expected["Floats"] = expected["Floats"].astype(np.float32) for col in parsed_118.columns: tm.assert_almost_equal(parsed_118[col], expected[col]) with StataReader(self.dta22_118) as rdr: vl = rdr.variable_labels() - vl_expected = {u'Unicode_Cities_Strl': u'Here are some strls with Ünicode chars', + vl_expected = {u'Unicode_Cities_Strl': + u'Here are some strls with Ünicode chars', u'Longs': u'long data', u'Things': u'Here are some things', u'Bytes': u'byte data', @@ -305,8 +310,8 @@ def test_write_dta6(self): def test_read_write_dta10(self): original = DataFrame(data=[["string", "object", 1, 1.1, np.datetime64('2003-12-25')]], - columns=['string', 'object', 'integer', 'floating', - 'datetime']) + columns=['string', 'object', 'integer', + 'floating', 'datetime']) original["object"] = Series(original["object"], dtype=object) original.index.name = 'index' original.index = original.index.astype(np.int32) @@ -327,7 +332,7 @@ def test_stata_doc_examples(self): def test_write_preserves_original(self): # 9795 np.random.seed(423) - df = pd.DataFrame(np.random.randn(5,4), columns=list('abcd')) + df = pd.DataFrame(np.random.randn(5, 4), columns=list('abcd')) df.ix[2, 'a':'c'] = np.nan df_copy = df.copy() with tm.ensure_clean() as path: @@ -348,18 +353,20 @@ def test_encoding(self): else: expected = raw.kreis1849.str.decode("latin-1")[0] self.assertEqual(result, expected) - self.assertIsInstance(result, unicode) + self.assertIsInstance(result, unicode) # noqa with tm.ensure_clean() as path: - encoded.to_stata(path,encoding='latin-1', write_index=False) + encoded.to_stata(path, encoding='latin-1', write_index=False) reread_encoded = read_stata(path, 
encoding='latin-1') tm.assert_frame_equal(encoded, reread_encoded) def test_read_write_dta11(self): original = DataFrame([(1, 2, 3, 4)], - columns=['good', compat.u('b\u00E4d'), '8number', 'astringwithmorethan32characters______']) + columns=['good', compat.u('b\u00E4d'), '8number', + 'astringwithmorethan32characters______']) formatted = DataFrame([(1, 2, 3, 4)], - columns=['good', 'b_d', '_8number', 'astringwithmorethan32characters_']) + columns=['good', 'b_d', '_8number', + 'astringwithmorethan32characters_']) formatted.index.name = 'index' formatted = formatted.astype(np.int32) @@ -370,7 +377,8 @@ def test_read_write_dta11(self): tm.assert_equal(len(w), 1) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + tm.assert_frame_equal( + written_and_read_again.set_index('index'), formatted) def test_read_write_dta12(self): original = DataFrame([(1, 2, 3, 4, 5, 6)], @@ -393,10 +401,12 @@ def test_read_write_dta12(self): with tm.ensure_clean() as path: with warnings.catch_warnings(record=True) as w: original.to_stata(path, None) - tm.assert_equal(len(w), 1) # should get a warning for that format. + # should get a warning for that format. + tm.assert_equal(len(w), 1) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), formatted) + tm.assert_frame_equal( + written_and_read_again.set_index('index'), formatted) def test_read_write_dta13(self): s1 = Series(2**9, dtype=np.int16) @@ -420,7 +430,8 @@ def test_read_write_reread_dta14(self): for col in cols: expected[col] = expected[col]._convert(datetime=True, numeric=True) expected['float_'] = expected['float_'].astype(np.float32) - expected['date_td'] = pd.to_datetime(expected['date_td'], errors='coerce') + expected['date_td'] = pd.to_datetime( + expected['date_td'], errors='coerce') parsed_113 = self.read_dta(self.dta14_113) parsed_113.index.name = 'index' @@ -438,7 +449,8 @@ def test_read_write_reread_dta14(self): with tm.ensure_clean() as path: parsed_114.to_stata(path, {'date_td': 'td'}) written_and_read_again = self.read_dta(path) - tm.assert_frame_equal(written_and_read_again.set_index('index'), parsed_114) + tm.assert_frame_equal( + written_and_read_again.set_index('index'), parsed_114) def test_read_write_reread_dta15(self): expected = self.read_csv(self.csv15) @@ -447,7 +459,8 @@ def test_read_write_reread_dta15(self): expected['long_'] = expected['long_'].astype(np.int32) expected['float_'] = expected['float_'].astype(np.float32) expected['double_'] = expected['double_'].astype(np.float64) - expected['date_td'] = expected['date_td'].apply(datetime.strptime, args=('%Y-%m-%d',)) + expected['date_td'] = expected['date_td'].apply( + datetime.strptime, args=('%Y-%m-%d',)) parsed_113 = self.read_dta(self.dta15_113) parsed_114 = self.read_dta(self.dta15_114) @@ -464,10 +477,12 @@ def test_timestamp_and_label(self): time_stamp = datetime(2000, 2, 29, 14, 21) data_label = 'This is a data file.' 
         with tm.ensure_clean() as path:
-            original.to_stata(path, time_stamp=time_stamp, data_label=data_label)
+            original.to_stata(path, time_stamp=time_stamp,
+                              data_label=data_label)

             with StataReader(path) as reader:
-                parsed_time_stamp = dt.datetime.strptime(reader.time_stamp, ('%d %b %Y %H:%M'))
+                parsed_time_stamp = dt.datetime.strptime(
+                    reader.time_stamp, ('%d %b %Y %H:%M'))
                 assert parsed_time_stamp == time_stamp
                 assert reader.data_label == data_label

@@ -507,8 +522,8 @@ def test_no_index(self):
         with tm.ensure_clean() as path:
             original.to_stata(path, write_index=False)
             written_and_read_again = self.read_dta(path)
-            tm.assertRaises(KeyError,
-                            lambda: written_and_read_again['index_not_written'])
+            tm.assertRaises(
+                KeyError, lambda: written_and_read_again['index_not_written'])

     def test_string_no_dates(self):
         s1 = Series(['a', 'A longer string'])
@@ -616,7 +631,7 @@ def test_variable_labels(self):
             sr_117 = rdr.variable_labels()
         keys = ('var1', 'var2', 'var3')
         labels = ('label1', 'label2', 'label3')
-        for k,v in compat.iteritems(sr_115):
+        for k, v in compat.iteritems(sr_115):
             self.assertTrue(k in sr_117)
             self.assertTrue(v == sr_117[k])
             self.assertTrue(k in keys)
@@ -626,7 +641,8 @@ def test_minimal_size_col(self):
         str_lens = (1, 100, 244)
         s = {}
         for str_len in str_lens:
-            s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len])
+            s['s' + str(str_len)] = Series(['a' * str_len,
+                                            'b' * str_len, 'c' * str_len])
         original = DataFrame(s)
         with tm.ensure_clean() as path:
             original.to_stata(path, write_index=False)
@@ -643,15 +659,16 @@ def test_excessively_long_string(self):
         str_lens = (1, 244, 500)
         s = {}
         for str_len in str_lens:
-            s['s' + str(str_len)] = Series(['a' * str_len, 'b' * str_len, 'c' * str_len])
+            s['s' + str(str_len)] = Series(['a' * str_len,
+                                            'b' * str_len, 'c' * str_len])
         original = DataFrame(s)
         with tm.assertRaises(ValueError):
             with tm.ensure_clean() as path:
                 original.to_stata(path)

     def test_missing_value_generator(self):
-        types = ('b','h','l')
-        df = DataFrame([[0.0]],columns=['float_'])
+        types = ('b', 'h', 'l')
+        df = DataFrame([[0.0]], columns=['float_'])
         with tm.ensure_clean() as path:
             df.to_stata(path)
             with StataReader(path) as rdr:
@@ -660,20 +677,22 @@ def test_missing_value_generator(self):
             expected_values.insert(0, '.')
             for t in types:
                 offset = valid_range[t][1]
-                for i in range(0,27):
-                    val = StataMissingValue(offset+1+i)
+                for i in range(0, 27):
+                    val = StataMissingValue(offset + 1 + i)
                     self.assertTrue(val.string == expected_values[i])

         # Test extremes for floats
-        val = StataMissingValue(struct.unpack(

Date: Wed, 20 Jan 2016 22:35:01 -0800
Subject: [PATCH 2/2] Address comments on parse_dates part of docstring

---
 pandas/io/parsers.py | 23 +++++++----------------
 1 file changed, 7 insertions(+), 16 deletions(-)

diff --git a/pandas/io/parsers.py b/pandas/io/parsers.py
index 293a4701eb46d..f06ad927bb61b 100755
--- a/pandas/io/parsers.py
+++ b/pandas/io/parsers.py
@@ -101,23 +101,14 @@ class ParserWarning(Warning):
     If na_values are specified and keep_default_na is False the default NaN
     values are overridden, otherwise they're appended to
 parse_dates : various, default False
-    Acceptable input types
-    * boolean. If True -> try parsing the index.
-
-    * list of ints or names
-
-        * If [1, 2, 3] -> try parsing columns 1, 2, 3 each as a separate date
-          column.
-
-    * list of lists
-
-        * If [[1, 3]] -> combine columns 1 and 3 and parse as a single date
-          column.
- - * dict - - * {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result 'foo' + * boolean. If True -> try parsing the index. + * list of ints or names. e.g. If [1, 2, 3] -> try parsing columns 1, 2, 3 + each as a separate date column. + * list of lists. e.g. If [[1, 3]] -> combine columns 1 and 3 and parse as + a single date column. + * dict, e.g. {'foo' : [1, 3]} -> parse columns 1, 3 as date and call result + 'foo' Note: A fast-path exists for iso8601-formatted dates. keep_date_col : boolean, default False If True and parse_dates specifies combining multiple columns then