Feat: test on pandas xlsx and ods file reader

Feat: start testing fs_repository
Feat: test consume_flux
2024-10-07 05:27:46 +02:00 · 2024-08-14 10:44:38 +02:00 · 2024-08-14 07:41:36 +02:00
10 changed files with 183 additions and 28 deletions
--- a/dashboard/libs/flux/flux.py
+++ b/dashboard/libs/flux/flux.py
@@ -25,7 +25,7 @@ class Flux(BaseModel):


 class State(BaseModel):
-    statuses: dict[str, str]
+    statuses: dict[str, dict]
    qty_out: int
    failed_lines: list[str]
    start: datetime
--- a/dashboard/libs/repository/fs_repository.py
+++ b/dashboard/libs/repository/fs_repository.py
@@ -4,61 +4,83 @@ import pandas as pd

 from .repository import AbstractRepository

+ACCEPTABLE_EXTENTIONS = {
+    "csv": [".csv"],
+    "excel": [".xls", ".xlsx"],
+}

 class FSRepository(AbstractRepository):
    def __init__(self, name, basepath, metadata_engine=None):
        self.name = name

        self.basepath = Path(basepath)
+        assert self.basepath.exists()
        self._metadata_engine = metadata_engine

-    def ls(self, dir, only_files=False, only_directories=False, recursive=False) -> list[str]:
-        dirpath = Path(dir)
+    def ls(
+        self, dir="", only_files=False, only_directories=False, recursive=False
+    ) -> list[str]:
+        dirpath = self.basepath / dir

        if only_files:
-            return [str(f.relative_to(dirpath)) for f in dirpath.iterdir() if not f.is_dir()]
+            return [
+                str(f.relative_to(dirpath))
+                for f in dirpath.iterdir()
+                if not f.is_dir() and not str(f).startswith(".")
+            ]

        if only_directories:
            if recursive:
-                return [str(f[0].relative_to(dirpath)) for f in dirpath.walk()]
+                return [
+                    str(f[0].relative_to(dirpath))
+                    for f in dirpath.walk()
+                    if not str(f).startswith(".")
+                ]

-            return [str(f.relative_to(dirpath)) for f in dirpath.iterdir() if f.is_dir()]
+            return [
+                str(f.relative_to(dirpath))
+                for f in dirpath.iterdir()
+                if f.is_dir() and not str(f).startswith(".")
+            ]

-        return [str(f.relative_to(dirpath)) for f in dirpath.iterdir()]
+        return [
+            str(f.relative_to(dirpath))
+            for f in dirpath.iterdir()
+            if not str(f).startswith(".")
+        ]

    def schemas(self, recursive=True) -> list[str]:
-        dirpath = self.basepath
-        return self.ls(dirpath, only_directories=True, recursive=True)
+        return self.ls("", only_directories=True, recursive=True)

-    def tables(self, schema:str) -> list[str]:
-        dirpath = self.basepath / schema
-        return self.ls(dirpath, only_files=True)
+    def tables(self, schema: str = ".") -> list[str]:
+        return self.ls(schema, only_files=True)

-    def build_table_path(self, table:str, schema:str):
+    def build_table_path(self, table: str, schema: str):
        table_path = self.basepath
-        if schema == '.':
+        if schema == ".":
            return table_path / table
        return table_path / schema / table

-    def info(self, table:str, schema:str='.'):
+    def infos(self, table: str, schema: str = "."):
        table_path = self.build_table_path(table, schema)
        pass

-    def read(self, table:str, schema:str='.', read_options={}):
+    def read(self, table: str, schema: str = ".", **read_options):
        table_path = self.build_table_path(table, schema)
+        assert table_path.exists()
        extension = table_path.suffix
-        if extension == '.csv':
+        if extension in ACCEPTABLE_EXTENTIONS["csv"]:
            return pd.read_csv(table_path, **read_options)

-        if extension == '.xlsx':
-            return pd.read_excel(table_path, **read_options)
+        if extension in ACCEPTABLE_EXTENTIONS["excel"]:
+            return pd.read_excel(table_path, engine = "openpyxl", **read_options)

-        raise ValueError("Can't open the table")
+        raise ValueError("Bad extention. Can't open the table.")

-    def write(self, content, table:str, schema:str='.'):
+    def write(self, content, table: str, schema: str = "."):
        table_path = self.build_table_path(table, schema)
        pass

-    def delete(self, table:str, schema:str='.'):
+    def delete_table(self, table: str, schema: str = "."):
        table_path = self.build_table_path(table, schema)
        pass
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,6 @@
 jupyter==1.0.0 
-pandas==1.5.0
-pdf-oralia==0.3.11
-pydantic==2.6.1
+pandas==2.2.2
+pydantic==2.8.2
 click==8.1.7
-dlt[duckdb]>=0.4.3a0
-openpyxl>=3.0.0
+openpyxl==3.1.5
+xlrd==2.0.1
--- a/tests/repository/init.py
+++ b/tests/repository/init.py
--- a/tests/repository/fs_examples/salary.pdf
+++ b/tests/repository/fs_examples/salary.pdf
--- a/tests/repository/fs_examples/username-password-recovery-code.xls
+++ b/tests/repository/fs_examples/username-password-recovery-code.xls
--- a/tests/repository/fs_examples/username-password-recovery-code.xlsx
+++ b/tests/repository/fs_examples/username-password-recovery-code.xlsx
--- a/tests/repository/fs_examples/username.csv
+++ b/tests/repository/fs_examples/username.csv
@@ -0,0 +1,7 @@
+Username;Identifier;First name;Last name
+booker12;9012;Rachel;Booker
+grey07;2070;Laura;Grey
+johnson81;4081;Craig;Johnson
+jenkins46;9346;Mary;Jenkins
+smith79;5079;Jamie;Smith
+
--- a/tests/repository/test_fs_repository.py
+++ b/tests/repository/test_fs_repository.py
@@ -0,0 +1,84 @@
+import shutil
+from pathlib import Path
+
+import pytest
+from pandas import pandas
+
+from dashboard.libs.repository.fs_repository import FSRepository
+
+EXAMPLE_DIR = "./tests/repository/fs_examples/"
+
+
+@pytest.fixture
+def location(tmp_path):
+    loc = tmp_path
+    username_loc = loc / "username"
+    username_loc.mkdir()
+    salary_loc = loc / "salary"
+    salary_loc.mkdir()
+    example_src = Path(EXAMPLE_DIR)
+
+    for f in example_src.glob("*"):
+        if "username" in str(f):
+            shutil.copy(f, username_loc)
+        else:
+            shutil.copy(f, salary_loc)
+
+    return loc
+
+
+def test_init(location):
+    repo = FSRepository("example", location)
+    assert repo.ls() == [
+        "username",
+        "salary",
+    ]
+    assert repo.schemas() == [
+        ".",
+        "username",
+        "salary",
+    ]
+
+    assert repo.tables() == []
+    assert repo.tables("username") == [
+        "username.csv",
+        "username-password-recovery-code.xlsx",
+        "username-password-recovery-code.xls",
+    ]
+    assert repo.tables("salary") == ["salary.pdf"]
+
+
+def test_read_csv(location):
+    repo = FSRepository("example", location)
+    username = repo.read("username.csv", "username", delimiter=";")
+    assert list(username.columns) == [
+        "Username",
+        "Identifier",
+        "First name",
+        "Last name",
+    ]
+    assert len(username.index) == 5
+
+
+def test_fake_read_xlsx(location):
+    repo = FSRepository("example", location)
+    df = pandas.read_excel(
+        location / "username" / "username-password-recovery-code.xls"
+    )
+    print(df)
+
+
+def test_read_xlsx(location):
+    repo = FSRepository("example", location)
+    username = repo.read("username-password-recovery-code.xls", "username")
+    assert list(username.columns) == [
+        "Username",
+        "Identifier",
+        "One-time password",
+        "Recovery code",
+        "First name",
+        "Last name",
+        "Department",
+        "Location",
+    ]
+    assert len(username.index) == 5
--- a/tests/test_flux.py
+++ b/tests/test_flux.py
@@ -1,6 +1,7 @@
 import pandas as pd
 import pytest

+from dashboard.libs.flux.flux import Flux, consume_flux
 from dashboard.libs.repository.repository import AbstractRepository

 FakeTable = pd.DataFrame
@@ -43,7 +44,17 @@ class FakeRepository(AbstractRepository):

    def write(self, content, table, schema) -> dict[str, str]:
        """Write content into the table"""
-        self._schemas[schema][table]["df"] = content
+        try:
+            self._schemas[schema][table]["df"] = content
+        except KeyError:
+            self._schemas[schema][table] = {
+                "df": content,
+                "metadata": {
+                    "status": "new",
+                    "qty_read": 0,
+                    "qty_write": 0,
+                },
+            }
        self._schemas[schema][table]["metadata"]["status"] = "modified"
        self._schemas[schema][table]["metadata"]["qty_write"] += 1
        return self.infos(table, schema)
@@ -86,3 +97,35 @@ def test_fakerepository():
        "qty_read": 1,
        "qty_write": 1,
    }
+
+
+def test_consume_flux():
+    source_repository = FakeRepository(
+        {
+            "source": {
+                "table1": pd.DataFrame({"A": [1, 2, 3]}),
+            },
+        }
+    )
+    dest_repository = FakeRepository(
+        {
+            "destination": {},
+        }
+    )
+    repositories = {
+        "source": source_repository,
+        "dest": dest_repository,
+    }
+    transformation = lambda dfs: {"dest": dfs[0] * 2}
+
+    flux = Flux(
+        sources=[{"repository": "source", "schema": "source", "table": "table1"}],
+        destinations={
+            "dest": {"repository": "dest", "schema": "destination", "table": "table1"}
+        },
+        transformation=transformation,
+    )
+
+    state = consume_flux(flux, repositories)
+    assert state.statuses["dest"] == {'status': 'modified', 'qty_read': 0, 'qty_write': 1}
+    assert dest_repository.read("table1", "destination").equals(pd.DataFrame({"A": [2, 4, 6]}))
Author	SHA1	Message	Date
Bertrand Benjamin	e794242a03	Feat: test on pandas xlsx and ods file reader	2024-10-07 05:27:46 +02:00
Bertrand Benjamin	5450de8628	Feat: start testing fs_repository	2024-08-14 10:44:38 +02:00
Bertrand Benjamin	08c7fbe4c5	Feat: test consume_flux	2024-08-14 07:41:36 +02:00