Feat: test on pandas xlsx and ods file reader

This commit is contained in:
Bertrand Benjamin 2024-10-07 05:27:46 +02:00
parent 5450de8628
commit e794242a03
6 changed files with 69 additions and 15 deletions

View File

@ -4,6 +4,10 @@ import pandas as pd
from .repository import AbstractRepository from .repository import AbstractRepository
# File suffixes that FSRepository.read accepts, grouped by the pandas reader
# used for them: "csv" -> pd.read_csv, "excel" -> pd.read_excel.
# NOTE(review): read() passes engine="openpyxl" for every "excel" suffix;
# presumably openpyxl cannot open legacy ".xls" workbooks — confirm, and
# either drop the forced engine or route ".xls" to another engine if so.
ACCEPTABLE_EXTENTIONS = {
"csv": [".csv"],
"excel": [".xls", ".xlsx"],
}
class FSRepository(AbstractRepository): class FSRepository(AbstractRepository):
def __init__(self, name, basepath, metadata_engine=None): def __init__(self, name, basepath, metadata_engine=None):
@ -20,18 +24,30 @@ class FSRepository(AbstractRepository):
if only_files: if only_files:
return [ return [
str(f.relative_to(dirpath)) for f in dirpath.iterdir() if not f.is_dir() str(f.relative_to(dirpath))
for f in dirpath.iterdir()
if not f.is_dir() and not str(f).startswith(".")
] ]
if only_directories: if only_directories:
if recursive: if recursive:
return [str(f[0].relative_to(dirpath)) for f in dirpath.walk()]
return [ return [
str(f.relative_to(dirpath)) for f in dirpath.iterdir() if f.is_dir() str(f[0].relative_to(dirpath))
for f in dirpath.walk()
if not str(f).startswith(".")
] ]
return [str(f.relative_to(dirpath)) for f in dirpath.iterdir()] return [
str(f.relative_to(dirpath))
for f in dirpath.iterdir()
if f.is_dir() and not str(f).startswith(".")
]
return [
str(f.relative_to(dirpath))
for f in dirpath.iterdir()
if not str(f).startswith(".")
]
def schemas(self, recursive=True) -> list[str]: def schemas(self, recursive=True) -> list[str]:
return self.ls("", only_directories=True, recursive=True) return self.ls("", only_directories=True, recursive=True)
@ -49,16 +65,17 @@ class FSRepository(AbstractRepository):
table_path = self.build_table_path(table, schema) table_path = self.build_table_path(table, schema)
pass pass
def read(self, table: str, schema: str = ".", read_options={}): def read(self, table: str, schema: str = ".", **read_options):
table_path = self.build_table_path(table, schema) table_path = self.build_table_path(table, schema)
assert table_path.exists()
extension = table_path.suffix extension = table_path.suffix
if extension == ".csv": if extension in ACCEPTABLE_EXTENTIONS["csv"]:
return pd.read_csv(table_path, **read_options) return pd.read_csv(table_path, **read_options)
if extension == ".xlsx": if extension in ACCEPTABLE_EXTENTIONS["excel"]:
return pd.read_excel(table_path, **read_options) return pd.read_excel(table_path, engine = "openpyxl", **read_options)
raise ValueError("Can't open the table") raise ValueError("Bad extention. Can't open the table.")
def write(self, content, table: str, schema: str = "."): def write(self, content, table: str, schema: str = "."):
table_path = self.build_table_path(table, schema) table_path = self.build_table_path(table, schema)

View File

@ -1,7 +1,6 @@
jupyter==1.0.0 jupyter==1.0.0
pandas==1.5.0 pandas==2.2.2
pdf-oralia==0.3.11 pydantic==2.8.2
pydantic==2.6.1
click==8.1.7 click==8.1.7
dlt[duckdb]>=0.4.3a0 openpyxl==3.1.5
openpyxl>=3.0.0 xlrd==2.0.1

View File

@ -1,4 +1,4 @@
Username; Identifier;First name;Last name Username;Identifier;First name;Last name
booker12;9012;Rachel;Booker booker12;9012;Rachel;Booker
grey07;2070;Laura;Grey grey07;2070;Laura;Grey
johnson81;4081;Craig;Johnson johnson81;4081;Craig;Johnson

1 Username Identifier First name Last name
2 booker12 9012 Rachel Booker
3 grey07 2070 Laura Grey
4 johnson81 4081 Craig Johnson

View File

@ -2,6 +2,7 @@ import shutil
from pathlib import Path from pathlib import Path
import pytest import pytest
from pandas import pandas
from dashboard.libs.repository.fs_repository import FSRepository from dashboard.libs.repository.fs_repository import FSRepository
@ -42,5 +43,42 @@ def test_init(location):
assert repo.tables("username") == [ assert repo.tables("username") == [
"username.csv", "username.csv",
"username-password-recovery-code.xlsx", "username-password-recovery-code.xlsx",
"username-password-recovery-code.xls",
] ]
assert repo.tables("salary") == ["salary.pdf"] assert repo.tables("salary") == ["salary.pdf"]
def test_read_csv(location):
    """Read the semicolon-delimited username CSV through the repository.

    Checks the parsed header names and the number of data rows.
    """
    expected_columns = [
        "Username",
        "Identifier",
        "First name",
        "Last name",
    ]
    frame = FSRepository("example", location).read(
        "username.csv", "username", delimiter=";"
    )
    assert list(frame.columns) == expected_columns
    assert len(frame.index) == 5
def test_fake_read_xlsx(location):
    """Check that pandas can open the ``.xls`` fixture directly.

    The workbook is read with ``pandas.read_excel`` rather than through
    ``FSRepository.read``, so this test does not depend on the repository
    layer at all.
    """
    df = pandas.read_excel(
        location / "username" / "username-password-recovery-code.xls"
    )
    # A bare print() passes even when nothing was parsed; assert on the
    # result so the test can actually fail.
    assert not df.empty
def test_read_xlsx(location):
    """Read the password-recovery workbook through the repository.

    Checks the parsed header names and the number of data rows.

    NOTE(review): despite the test name, the file requested here carries the
    ``.xls`` suffix, not ``.xlsx`` — confirm which fixture is intended.
    """
    expected_columns = [
        "Username",
        "Identifier",
        "One-time password",
        "Recovery code",
        "First name",
        "Last name",
        "Department",
        "Location",
    ]
    frame = FSRepository("example", location).read(
        "username-password-recovery-code.xls", "username"
    )
    assert list(frame.columns) == expected_columns
    assert len(frame.index) == 5