Feat: test on pandas xlsx and ods file reader

2024-10-07 05:27:46 +02:00
parent 5450de8628
commit e794242a03
6 changed files with 69 additions and 15 deletions
--- a/dashboard/libs/repository/fs_repository.py
+++ b/dashboard/libs/repository/fs_repository.py
@@ -4,6 +4,10 @@ import pandas as pd
 from .repository import AbstractRepository
 # File suffixes accepted for each table format; the keys ("csv", "excel")
 # are looked up by FSRepository.read to choose the pandas reader.
 ACCEPTABLE_EXTENTIONS = {
    "csv": [".csv"],
    "excel": [".xls", ".xlsx"],
 }
 class FSRepository(AbstractRepository):
    def __init__(self, name, basepath, metadata_engine=None):
@@ -20,18 +24,30 @@ class FSRepository(AbstractRepository):
        if only_files:
            return [
-                str(f.relative_to(dirpath)) for f in dirpath.iterdir() if not f.is_dir()
+                str(f.relative_to(dirpath))
                for f in dirpath.iterdir()
                if not f.is_dir() and not str(f).startswith(".")
            ]
        if only_directories:
            if recursive:
                return [str(f[0].relative_to(dirpath)) for f in dirpath.walk()]
                return [
-                str(f.relative_to(dirpath)) for f in dirpath.iterdir() if f.is_dir()
+                    str(f[0].relative_to(dirpath))
                    for f in dirpath.walk()
                    if not str(f).startswith(".")
                ]
-        return [str(f.relative_to(dirpath)) for f in dirpath.iterdir()]
+            return [
                str(f.relative_to(dirpath))
                for f in dirpath.iterdir()
                if f.is_dir() and not str(f).startswith(".")
            ]
        return [
            str(f.relative_to(dirpath))
            for f in dirpath.iterdir()
            if not str(f).startswith(".")
        ]
    def schemas(self, recursive=True) -> list[str]:
        """Return the directories that ``ls`` reports for the repository root.

        Args:
            recursive: forwarded to ``ls``; the default ``True`` keeps the
                previous behaviour, while ``False`` is now actually honoured
                instead of being silently overridden.

        Returns:
            The list of directory paths produced by
            ``self.ls("", only_directories=True, recursive=recursive)``.
        """
        return self.ls("", only_directories=True, recursive=recursive)
@@ -49,16 +65,17 @@ class FSRepository(AbstractRepository):
        table_path = self.build_table_path(table, schema)
        pass
-    def read(self, table: str, schema: str = ".", read_options={}):
+    def read(self, table: str, schema: str = ".", **read_options):
+        """Load a table file with pandas, choosing the reader from its suffix.
+
+        Suffixes listed under ``ACCEPTABLE_EXTENTIONS["csv"]`` are read with
+        ``pd.read_csv`` and those under ``ACCEPTABLE_EXTENTIONS["excel"]``
+        with ``pd.read_excel``.
+
+        Args:
+            table: table file name, resolved through ``build_table_path``.
+            schema: schema handed to ``build_table_path`` (default ``"."``).
+            **read_options: keyword arguments forwarded to the pandas reader.
+
+        Returns:
+            Whatever the selected pandas reader returns.
+
+        Raises:
+            AssertionError: if the resolved path does not exist.
+            ValueError: if the suffix matches no supported format.
+        """
        table_path = self.build_table_path(table, schema)
        assert table_path.exists()
        extension = table_path.suffix
-        if extension == ".csv":
+        if extension in ACCEPTABLE_EXTENTIONS["csv"]:
            return pd.read_csv(table_path, **read_options)
-        if extension == ".xlsx":
+        if extension in ACCEPTABLE_EXTENTIONS["excel"]:
+            # No engine is forced here: openpyxl cannot open legacy .xls
+            # files, so pandas is left to pick the engine from the file
+            # format. Callers may still pass ``engine=...`` in read_options
+            # without triggering a duplicate-keyword TypeError.
            return pd.read_excel(table_path, **read_options)
-        raise ValueError("Can't open the table")
+        raise ValueError("Bad extention. Can't open the table.")
    def write(self, content, table: str, schema: str = "."):
        table_path = self.build_table_path(table, schema)
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
 jupyter==1.0.0 
-pandas==1.5.0
+pandas==2.2.2
-pdf-oralia==0.3.11
+pydantic==2.8.2
-pydantic==2.6.1
 click==8.1.7
-dlt[duckdb]>=0.4.3a0
+openpyxl==3.1.5
-openpyxl>=3.0.0
+xlrd==2.0.1
--- a/tests/repository/fs_examples/username-password-recovery-code.xls
+++ b/tests/repository/fs_examples/username-password-recovery-code.xls
--- a/tests/repository/fs_examples/username-password-recovery-code.xlsx
+++ b/tests/repository/fs_examples/username-password-recovery-code.xlsx
--- a/tests/repository/fs_examples/username.csv
+++ b/tests/repository/fs_examples/username.csv
@@ -1,4 +1,4 @@
-Username; Identifier;First name;Last name
+Username;Identifier;First name;Last name
 booker12;9012;Rachel;Booker
 grey07;2070;Laura;Grey
 johnson81;4081;Craig;Johnson
--- a/tests/repository/test_fs_repository.py
+++ b/tests/repository/test_fs_repository.py
@@ -2,6 +2,7 @@ import shutil
 from pathlib import Path
 import pytest
 from pandas import pandas
 from dashboard.libs.repository.fs_repository import FSRepository
@@ -42,5 +43,42 @@ def test_init(location):
    assert repo.tables("username") == [
        "username.csv",
        "username-password-recovery-code.xlsx",
        "username-password-recovery-code.xls",
    ]
    assert repo.tables("salary") == ["salary.pdf"]
 def test_read_csv(location):
    """Read the semicolon-delimited username CSV fixture via the repository."""
    expected_columns = ["Username", "Identifier", "First name", "Last name"]
    frame = FSRepository("example", location).read(
        "username.csv", "username", delimiter=";"
    )
    assert frame.columns.tolist() == expected_columns
    assert frame.shape[0] == 5
 def test_fake_read_xlsx(location):
    """Check that pandas on its own can open the legacy .xls fixture.

    FSRepository is deliberately not involved, so a failure here points at
    pandas or its Excel engine rather than at the repository code.
    """
    df = pandas.read_excel(
        location / "username" / "username-password-recovery-code.xls"
    )
    # Assert on the result rather than printing it, so the check does not
    # depend on someone reading the captured output.
    assert not df.empty
 def test_read_xlsx(location):
    """Read the .xls Excel fixture via the repository and check its layout."""
    expected_columns = [
        "Username",
        "Identifier",
        "One-time password",
        "Recovery code",
        "First name",
        "Last name",
        "Department",
        "Location",
    ]
    frame = FSRepository("example", location).read(
        "username-password-recovery-code.xls", "username"
    )
    assert frame.columns.tolist() == expected_columns
    assert frame.shape[0] == 5