Skip to content

Commit 050a5e7

Browse files
committed
Class to read OpenDocument Tables
This is primarily intended for OpenDocument spreadsheets, such as those generated by LibreOffice Calc, but it will also work with LibreOffice Writer documents.
1 parent 15d8178 commit 050a5e7

File tree

11 files changed

+300
-0
lines changed

11 files changed

+300
-0
lines changed

ci/deps/travis-36.yaml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ dependencies:
1515
- nomkl
1616
- numexpr
1717
- numpy
18+
- odfpy
1819
- openpyxl
1920
- psycopg2
2021
- pyarrow=0.9.0

pandas/io/excel/_base.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -749,9 +749,11 @@ class ExcelFile(object):
749749
"""
750750

751751
from pandas.io.excel._xlrd import _XlrdReader
752+
from pandas.io.excel._odfreader import _ODFReader
752753

753754
_engines = {
754755
'xlrd': _XlrdReader,
756+
'odf': _ODFReader,
755757
}
756758

757759
def __init__(self, io, engine=None):

pandas/io/excel/_odfreader.py

Lines changed: 165 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,165 @@
1+
import pandas
2+
from pandas.io.parsers import TextParser
3+
4+
5+
class _ODFReader(object):
6+
"""Read tables out of OpenDocument formatted files
7+
8+
Parameters
9+
----------
10+
filepath_or_stream: string, path to be parsed or
11+
an open readable stream.
12+
"""
13+
def __init__(self, filepath_or_stream):
14+
try:
15+
from odf.opendocument import load as document_load
16+
from odf.table import Table
17+
except ImportError:
18+
raise ImportError("Install odfpy for OpenDocument support")
19+
20+
self.filepath_or_stream = None
21+
self.document = None
22+
self.tables = None
23+
self.filepath_or_stream = filepath_or_stream
24+
self.document = document_load(filepath_or_stream)
25+
self.tables = self.document.getElementsByType(Table)
26+
27+
@property
28+
def sheet_names(self):
29+
"""Return table names is the document"""
30+
from odf.namespaces import TABLENS
31+
return [t.attributes[(TABLENS, 'name')] for t in self.tables]
32+
33+
def get_sheet_by_index(self, index):
34+
return self.__get_table(self.tables[index])
35+
36+
def get_sheet_by_name(self, name):
37+
i = self.sheet_names.index(name)
38+
return self.__get_table(self.tables[i])
39+
40+
def get_sheet(self, name):
41+
"""Given a sheet name or index, return the root ODF Table node
42+
"""
43+
if isinstance(name, str):
44+
return self.get_sheet_by_name(name)
45+
elif isinstance(name, int):
46+
return self.get_sheet_by_index(name)
47+
else:
48+
raise ValueError(
49+
'Unrecognized sheet identifier type {}. Please use'
50+
'a string or integer'.format(type(name)))
51+
52+
def parse(self, sheet_name=0, **kwds):
53+
data = self.get_sheet(sheet_name)
54+
parser = TextParser(data, **kwds)
55+
return parser.read()
56+
57+
def __get_table(self, sheet):
58+
"""Parse an ODF Table into a list of lists
59+
"""
60+
from odf.table import TableCell, TableRow
61+
62+
sheet_rows = sheet.getElementsByType(TableRow)
63+
table = []
64+
empty_rows = 0
65+
max_row_len = 0
66+
for i, sheet_row in enumerate(sheet_rows):
67+
sheet_cells = sheet_row.getElementsByType(TableCell)
68+
empty_cells = 0
69+
table_row = []
70+
for j, sheet_cell in enumerate(sheet_cells):
71+
value = self.__get_cell_value(sheet_cell)
72+
column_repeat = self.__get_cell_repeat(sheet_cell)
73+
74+
if len(sheet_cell.childNodes) == 0:
75+
empty_cells += column_repeat
76+
else:
77+
if empty_cells > 0:
78+
table_row.extend([None] * empty_cells)
79+
empty_cells = 0
80+
table_row.extend([value] * column_repeat)
81+
82+
if max_row_len < len(table_row):
83+
max_row_len = len(table_row)
84+
85+
row_repeat = self.__get_row_repeat(sheet_row)
86+
if self.__is_empty_row(sheet_row):
87+
empty_rows += row_repeat
88+
else:
89+
if empty_rows > 0:
90+
# add blank rows to our table
91+
table.extend([[None]] * empty_rows)
92+
empty_rows = 0
93+
table.append(table_row)
94+
95+
# Make our table square
96+
for row in table:
97+
if len(row) < max_row_len:
98+
row.extend([None] * (max_row_len - len(row)))
99+
100+
return table
101+
102+
def __get_row_repeat(self, row):
103+
"""Return number of times this row was repeated
104+
105+
Repeating an empty row appeared to be a common way
106+
of representing sparse rows in the table.
107+
"""
108+
from odf.namespaces import TABLENS
109+
repeat = row.attributes.get((TABLENS, 'number-rows-repeated'))
110+
if repeat is None:
111+
return 1
112+
return int(repeat)
113+
114+
def __get_cell_repeat(self, cell):
115+
from odf.namespaces import TABLENS
116+
repeat = cell.attributes.get((TABLENS, 'number-columns-repeated'))
117+
if repeat is None:
118+
return 1
119+
return int(repeat)
120+
121+
def __is_empty_row(self, row):
122+
"""Helper function to find empty rows
123+
"""
124+
for column in row.childNodes:
125+
if len(column.childNodes) > 0:
126+
return False
127+
128+
return True
129+
130+
def __get_cell_value(self, cell):
131+
from odf.namespaces import OFFICENS
132+
cell_type = cell.attributes.get((OFFICENS, 'value-type'))
133+
if cell_type == 'boolean':
134+
cell_value = cell.attributes.get((OFFICENS, 'boolean'))
135+
return bool(cell_value)
136+
elif cell_type in ('float', 'percentage'):
137+
cell_value = cell.attributes.get((OFFICENS, 'value'))
138+
return float(cell_value)
139+
elif cell_type == 'string':
140+
return str(cell)
141+
elif cell_type == 'currency':
142+
cell_value = cell.attributes.get((OFFICENS, 'value'))
143+
return float(cell_value)
144+
elif cell_type == 'date':
145+
cell_value = cell.attributes.get((OFFICENS, 'date-value'))
146+
return pandas.Timestamp(cell_value)
147+
elif cell_type == 'time':
148+
cell_value = cell.attributes.get((OFFICENS, 'time-value'))
149+
return(pandas_isoduration_compatibility(cell_value))
150+
elif cell_type is None:
151+
return None
152+
else:
153+
raise ValueError('Unrecognized type {}'.format(cell_type))
154+
155+
156+
def pandas_isoduration_compatibility(duration):
    """Build a ``pandas.Timedelta`` from an ISO 8601 duration string.

    LibreOffice writes durations without any day component, for example
    ``PT3H45M0S``, while the pandas Timedelta parser this was written
    against requires one. A zero-day component is therefore inserted
    when the string starts directly with the time part.
    Workaround for https://github.com/pandas-dev/pandas/issues/25422

    Parameters
    ----------
    duration : str
        ISO 8601 duration.

    Returns
    -------
    pandas.Timedelta
    """
    dayless_prefix = 'PT'
    if not duration.startswith(dayless_prefix):
        return pandas.Timedelta(duration)

    time_part = duration[len(dayless_prefix):]
    return pandas.Timedelta('P0DT' + time_part)
10.6 KB
Binary file not shown.

pandas/tests/io/data/datatypes.ods

10.4 KB
Binary file not shown.

pandas/tests/io/data/headers.ods

8.13 KB
Binary file not shown.
8.3 KB
Binary file not shown.
7.35 KB
Binary file not shown.
7.71 KB
Binary file not shown.

pandas/tests/io/data/writertable.odt

10.1 KB
Binary file not shown.

0 commit comments

Comments
 (0)