Skip to content

Commit

Permalink
ExcelReader: Migrate to openpyxl
Browse files Browse the repository at this point in the history
  • Loading branch information
VesnaT committed Dec 19, 2019
1 parent 6ef2e27 commit a0c1d41
Show file tree
Hide file tree
Showing 5 changed files with 77 additions and 27 deletions.
89 changes: 63 additions & 26 deletions Orange/data/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
import subprocess
import sys
import warnings
from typing import List, Iterable

from ast import literal_eval
from collections import OrderedDict, Counter, defaultdict
Expand All @@ -26,6 +27,7 @@

import xlrd
import xlsxwriter
import openpyxl

from Orange.data import (
_io, is_discrete_values, MISSING_VALUES, Table, Domain, Variable,
Expand Down Expand Up @@ -405,15 +407,15 @@ def __init__(self, filename):
self.sheet = None

@property
def sheets(self):
def sheets(self) -> List:
"""FileFormats with a notion of sheets should override this property
to return a list of sheet names in the file.
Returns
-------
a list of sheet names
"""
return ()
return []

def select_sheet(self, sheet):
"""Select sheet to be read
Expand Down Expand Up @@ -1007,33 +1009,12 @@ def __init__(self, filename):
super().__init__(filename=filename)
self._workbook = None

@property
def workbook(self):
if not self._workbook:
self._workbook = xlrd.open_workbook(self.filename, on_demand=True)
return self._workbook

@property
@lru_cache(1)
def sheets(self):
if self.workbook:
return self.workbook.sheet_names()
else:
return ()
def get_cells(self) -> Iterable:
raise NotImplementedError

def read(self):
if self.sheet:
ss = self.workbook.sheet_by_name(self.sheet)
else:
ss = self.workbook.sheet_by_index(0)
try:
first_row = next(i for i in range(ss.nrows) if any(ss.row_values(i)))
first_col = next(i for i in range(ss.ncols) if ss.cell_value(first_row, i))
row_len = ss.row_len(first_row)
cells = filter(any,
[[str(ss.cell_value(row, col)) if col < ss.row_len(row) else ''
for col in range(first_col, row_len)]
for row in range(first_row, ss.nrows)])
cells = self.get_cells()
table = self.data_table(cells)
table.name = path.splitext(path.split(self.filename)[-1])[0]
if self.sheet:
Expand Down Expand Up @@ -1069,12 +1050,68 @@ class ExcelReader(_BaseExcelReader):
EXTENSIONS = ('.xlsx',)
DESCRIPTION = 'Microsoft Excel spreadsheet'

@property
def workbook(self) -> openpyxl.Workbook:
if not self._workbook:
self._workbook = openpyxl.load_workbook(self.filename)
return self._workbook

@property
@lru_cache(1)
def sheets(self) -> List:
return self.workbook.sheetnames if self.workbook else []

def get_cells(self) -> Iterable:
def str_(x):
return str(x) if x is not None else ""

sheet = self._get_active_sheet()
cells = ([str_(sheet.cell(row, col).value)
for col in range(sheet.min_column, sheet.max_column + 1)]
for row in range(sheet.min_row, sheet.max_row + 1))
return filter(any, cells)

def _get_active_sheet(self) -> openpyxl.worksheet.worksheet.Worksheet:
if self.sheet:
return self.workbook[self.sheet]
else:
return self.workbook.active


class XlsReader(_BaseExcelReader):
"""Reader for .xls files"""
EXTENSIONS = ('.xls',)
DESCRIPTION = 'Microsoft Excel 97-2004 spreadsheet'

@property
def workbook(self) -> xlrd.Book:
if not self._workbook:
self._workbook = xlrd.open_workbook(self.filename, on_demand=True)
return self._workbook

@property
@lru_cache(1)
def sheets(self) -> List:
return self.workbook.sheet_names() if self.workbook else []

def get_cells(self) -> Iterable:
sheet = self._get_active_sheet()
first_row = next(i for i in range(sheet.nrows)
if any(sheet.row_values(i)))
first_col = next(i for i in range(sheet.ncols)
if sheet.cell_value(first_row, i))
row_len = sheet.row_len(first_row)
return filter(any, ([str(sheet.cell_value(row, col))
if col < sheet.row_len(row) else ''
for col in range(first_col, row_len)]
for row in range(first_row, sheet.nrows)))

def _get_active_sheet(self) -> xlrd.sheet.Sheet:
if self.sheet:
return self.workbook.sheet_by_name(self.sheet)
else:
return self.workbook.sheet_by_index(0)


class DotReader(FileFormat):
"""Writer for dot (graph) files"""
Expand Down
2 changes: 1 addition & 1 deletion Orange/tests/test_tab_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ def test_sheets(self):
file1 = io.StringIO("\n".join("xd dbac"))
reader = TabReader(file1)

self.assertEqual(reader.sheets, ())
self.assertEqual(reader.sheets, [])

def test_attributes_saving(self):
tempdir = tempfile.mkdtemp()
Expand Down
12 changes: 12 additions & 0 deletions Orange/tests/test_xlsx_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,18 @@ def wrapper(self):
return wrapper


class TestExcelReader(unittest.TestCase):
def test_read_round_floats(self):
table = read_file(get_xlsx_reader, "round_floats.xlsx")
domain = table.domain
self.assertIsNone(domain.class_var)
self.assertEqual(len(domain.metas), 0)
self.assertEqual(len(domain.attributes), 3)
self.assertIsInstance(domain[0], ContinuousVariable)
self.assertIsInstance(domain[1], ContinuousVariable)
self.assertListEqual(domain[2].values, ["1", "2"])


class TestExcelHeader0(unittest.TestCase):
@test_xlsx_xls
def test_read(self, reader: Callable[[str], io.FileFormat]):
Expand Down
Binary file added Orange/tests/xlsx_files/round_floats.xlsx
Binary file not shown.
1 change: 1 addition & 0 deletions requirements-core.txt
Original file line number Diff line number Diff line change
Expand Up @@ -21,3 +21,4 @@ requests
openTSNE>=0.3.11
pandas
pyyaml
openpyxl

0 comments on commit a0c1d41

Please sign in to comment.