diff --git a/dashboard/libs/repository/fs_repository.py b/dashboard/libs/repository/fs_repository.py
index 2d9d78c..95e56a3 100644
--- a/dashboard/libs/repository/fs_repository.py
+++ b/dashboard/libs/repository/fs_repository.py
@@ -4,6 +4,11 @@ import pandas as pd
 
 from .repository import AbstractRepository
 
+# Maps each reader kind used by FSRepository.read to the suffixes it accepts.
+ACCEPTABLE_EXTENSIONS = {
+    "csv": [".csv"],
+    "excel": [".xls", ".xlsx"],
+}
 
 class FSRepository(AbstractRepository):
     def __init__(self, name, basepath, metadata_engine=None):
@@ -20,18 +25,30 @@ class FSRepository(AbstractRepository):
 
         if only_files:
             return [
-                str(f.relative_to(dirpath)) for f in dirpath.iterdir() if not f.is_dir()
+                str(f.relative_to(dirpath))
+                for f in dirpath.iterdir()
+                if not f.is_dir() and not f.name.startswith(".")
             ]
 
         if only_directories:
             if recursive:
-                return [str(f[0].relative_to(dirpath)) for f in dirpath.walk()]
+                return [
+                    str(rel)
+                    for rel in (f[0].relative_to(dirpath) for f in dirpath.walk())
+                    if not any(part.startswith(".") for part in rel.parts)
+                ]
 
             return [
-                str(f.relative_to(dirpath)) for f in dirpath.iterdir() if f.is_dir()
+                str(f.relative_to(dirpath))
+                for f in dirpath.iterdir()
+                if f.is_dir() and not f.name.startswith(".")
             ]
 
-        return [str(f.relative_to(dirpath)) for f in dirpath.iterdir()]
+        return [
+            str(f.relative_to(dirpath))
+            for f in dirpath.iterdir()
+            if not f.name.startswith(".")
+        ]
 
     def schemas(self, recursive=True) -> list[str]:
         return self.ls("", only_directories=True, recursive=True)
@@ -49,16 +66,25 @@ class FSRepository(AbstractRepository):
         table_path = self.build_table_path(table, schema)
         pass
 
-    def read(self, table: str, schema: str = ".", read_options={}):
+    def read(self, table: str, schema: str = ".", **read_options):
+        """Read a table with the pandas reader that matches its file suffix.
+
+        Keyword arguments are forwarded to the pandas reader. Raises
+        FileNotFoundError when the file does not exist and ValueError
+        when the suffix is not a supported one.
+        """
         table_path = self.build_table_path(table, schema)
+        if not table_path.exists():
+            raise FileNotFoundError(table_path)
         extension = table_path.suffix
-        if extension == ".csv":
+        if extension in ACCEPTABLE_EXTENSIONS["csv"]:
             return pd.read_csv(table_path, **read_options)
 
-        if extension == ".xlsx":
-            return pd.read_excel(table_path, **read_options)
+        if extension in ACCEPTABLE_EXTENSIONS["excel"]:
+            # No explicit engine: pandas picks xlrd for .xls and openpyxl for .xlsx.
+            return pd.read_excel(table_path, **read_options)
 
-        raise ValueError("Can't open the table")
+        raise ValueError("Bad extension. Can't open the table.")
 
     def write(self, content, table: str, schema: str = "."):
         table_path = self.build_table_path(table, schema)
diff --git a/requirements.txt b/requirements.txt
index dd51a46..93d88e7 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
 jupyter==1.0.0
-pandas==1.5.0
-pdf-oralia==0.3.11
-pydantic==2.6.1
+pandas==2.2.2
+pydantic==2.8.2
 click==8.1.7
-dlt[duckdb]>=0.4.3a0
-openpyxl>=3.0.0
+openpyxl==3.1.5
+xlrd==2.0.1
diff --git a/tests/repository/fs_examples/username-password-recovery-code.xls b/tests/repository/fs_examples/username-password-recovery-code.xls
new file mode 100644
index 0000000..af0329b
Binary files /dev/null and b/tests/repository/fs_examples/username-password-recovery-code.xls differ
diff --git a/tests/repository/fs_examples/username-password-recovery-code.xlsx b/tests/repository/fs_examples/username-password-recovery-code.xlsx
index 1723d5c..1e2d2dd 100644
Binary files a/tests/repository/fs_examples/username-password-recovery-code.xlsx and b/tests/repository/fs_examples/username-password-recovery-code.xlsx differ
diff --git a/tests/repository/fs_examples/username.csv b/tests/repository/fs_examples/username.csv
index 006ac8e..45d43d4 100644
--- a/tests/repository/fs_examples/username.csv
+++ b/tests/repository/fs_examples/username.csv
@@ -1,4 +1,4 @@
-Username; Identifier;First name;Last name
+Username;Identifier;First name;Last name
 booker12;9012;Rachel;Booker
 grey07;2070;Laura;Grey
 johnson81;4081;Craig;Johnson
diff --git a/tests/repository/test_fs_repository.py b/tests/repository/test_fs_repository.py
index c005fed..b6617dd 100644
--- a/tests/repository/test_fs_repository.py
+++ b/tests/repository/test_fs_repository.py
@@ -2,6 +2,7 @@ import shutil
 from pathlib import Path
 
 import pytest
+import pandas
 
 from dashboard.libs.repository.fs_repository import FSRepository
 
@@ -42,5 +43,42 @@ def test_init(location):
     assert repo.tables("username") == [
         "username.csv",
         "username-password-recovery-code.xlsx",
+        "username-password-recovery-code.xls",
     ]
     assert repo.tables("salary") == ["salary.pdf"]
+
+
+def test_read_csv(location):
+    repo = FSRepository("example", location)
+    username = repo.read("username.csv", "username", delimiter=";")
+    assert list(username.columns) == [
+        "Username",
+        "Identifier",
+        "First name",
+        "Last name",
+    ]
+    assert len(username.index) == 5
+
+
+def test_fake_read_xlsx(location):
+    repo = FSRepository("example", location)
+    df = pandas.read_excel(
+        location / "username" / "username-password-recovery-code.xls"
+    )
+    print(df)
+
+
+def test_read_xlsx(location):
+    repo = FSRepository("example", location)
+    username = repo.read("username-password-recovery-code.xls", "username")
+    assert list(username.columns) == [
+        "Username",
+        "Identifier",
+        "One-time password",
+        "Recovery code",
+        "First name",
+        "Last name",
+        "Department",
+        "Location",
+    ]
+    assert len(username.index) == 5