Compare commits

5 Commits

15 changed files with 352 additions and 73 deletions

View File

View File

@@ -0,0 +1,70 @@
from collections.abc import Callable
from datetime import datetime
import pandas as pd
from pydantic import BaseModel
from ..repository.repository import AbstractRepository
class Schema(BaseModel):
    """Address of a schema: a repository name plus a schema name inside it."""

    # key into a repositories mapping (see the `Repositories` alias)
    repository: str
    # NOTE(review): `schema` shadows pydantic's BaseModel.schema() helper
    # (deprecated in pydantic v2) -- confirm no caller relies on it.
    schema: str
class Table(BaseModel):
    """Address of a table: repository name, schema name and table name."""

    # key into a repositories mapping (see the `Repositories` alias)
    repository: str
    # NOTE(review): `schema` shadows pydantic's BaseModel.schema() helper
    # (deprecated in pydantic v2) -- confirm no caller relies on it.
    schema: str
    table: str
class Flux(BaseModel):
    """A data flow: read sources, transform them, write named destinations."""

    # tables read, in order, before calling `transformation`
    sources: list[Table]
    # destination tables keyed by the names the transformation emits
    destinations: dict[str, Table]
    # maps the list of source frames (in `sources` order) to a
    # name -> frame mapping; each name must be a key of `destinations`
    transformation: Callable[[list[pd.DataFrame]], dict[str, pd.DataFrame]]
class State(BaseModel):
    """Execution report of one flux run."""

    # per-destination write status, as returned by the repository's write()
    statuses: dict[str, dict]
    # NOTE(review): qty_out and failed_lines are always 0 / [] in
    # consume_flux -- placeholders to be filled in later?
    qty_out: int
    failed_lines: list[str]
    # wall-clock bounds of the run (naive local datetimes)
    start: datetime
    end: datetime
Repositories = dict[str, AbstractRepository]
def open_source(repositories: Repositories, source: Table) -> pd.DataFrame:
    """Read the table addressed by *source* from its repository."""
    repo = repositories[source.repository]
    return repo.read(source.table, source.schema)
def write_source(
    content: pd.DataFrame, repositories: Repositories, destination: Table
) -> dict:
    """Write *content* into *destination* and return the write status.

    BUG FIX: the return annotation said ``str``, but ``repository.write``
    returns a status mapping -- ``State.statuses`` is ``dict[str, dict]``
    and the flux tests compare each status to a dict.
    """
    repository = repositories[destination.repository]
    return repository.write(content, destination.table, destination.schema)
def consume_flux(flux: Flux, repositories: dict[str, AbstractRepository]) -> State:
    """Run one flux end to end and report its execution state.

    Reads every source, applies the transformation, writes each produced
    frame to the destination registered under the same name, and returns
    a State with the per-destination write statuses and timing bounds.
    """
    start = datetime.now()

    frames = [open_source(repositories, src) for src in flux.sources]
    produced = flux.transformation(frames)

    # each key produced by the transformation must exist in destinations
    statuses = {}
    for name, frame in produced.items():
        statuses[name] = write_source(frame, repositories, flux.destinations[name])

    end = datetime.now()
    # qty_out / failed_lines are not computed yet -- reported as empty
    return State(
        statuses=statuses,
        qty_out=0,
        failed_lines=[],
        start=start,
        end=end,
    )

View File

@@ -1,35 +0,0 @@
from .schema import AbstractSchema
from pathlib import Path
class FSSchema(AbstractSchema):
    """Filesystem-backed schema: a schema is a directory, tables are files."""

    def __init__(self, basepath, metadata_engine=None):
        # root directory of the schema (kept as given, not resolved)
        self.basepath = basepath
        self._metadata_engine = metadata_engine

    def ls(self, dir, only_files=True):
        """List entries of *dir*; with only_files=True keep only files.

        BUG FIX: the only_files filter previously kept *directories*
        (``f.is_dir()``) instead of files.
        """
        dirpath = Path(dir)
        entries = list(dirpath.iterdir())
        if only_files:
            return [f for f in entries if not f.is_dir()]
        return entries

    def tables(self, dir, only_files=True):
        """List tables (files) of *dir* -- same listing rules as ls()."""
        # delegate instead of duplicating the (previously buggy) filter
        return self.ls(dir, only_files)

    def info(self, path):
        """Get infos about a file (not implemented yet)."""
        path = Path(path)
        pass

    def read(self, path):
        """Get content of a file (not implemented yet)."""
        path = Path(path)
        pass

    def write(self, path, content):
        """Write content into the file (not implemented yet)."""
        path = Path(path)
        pass

    def delete(self, path):
        """Delete the file (not implemented yet)."""
        path = Path(path)
        pass

View File

@@ -4,61 +4,83 @@ import pandas as pd
from .repository import AbstractRepository
# file extensions each pandas reader accepts
ACCEPTABLE_EXTENTIONS = {
    "csv": [".csv"],
    "excel": [".xls", ".xlsx"],
}


class FSRepository(AbstractRepository):
    """Repository backed by the filesystem: schemas are directories under
    ``basepath`` and tables are files inside them."""

    def __init__(self, name, basepath, metadata_engine=None):
        self.name = name
        self.basepath = Path(basepath)
        assert self.basepath.exists()
        self._metadata_engine = metadata_engine

    def ls(
        self, dir="", only_files=False, only_directories=False, recursive=False
    ) -> list[str]:
        """List entries of ``basepath/dir`` as strings relative to it.

        NOTE(review): the ``startswith(".")`` guards test the *full* path
        string (and, in the recursive branch, the str() of a walk() tuple),
        so hidden entries are effectively never filtered -- confirm intent.
        """
        dirpath = self.basepath / dir
        if only_files:
            return [
                str(f.relative_to(dirpath))
                for f in dirpath.iterdir()
                if not f.is_dir() and not str(f).startswith(".")
            ]
        if only_directories:
            if recursive:
                # Path.walk() yields (dirpath, dirnames, filenames);
                # the root itself comes first and shows up as ".".
                return [
                    str(f[0].relative_to(dirpath))
                    for f in dirpath.walk()
                    if not str(f).startswith(".")
                ]
            return [
                str(f.relative_to(dirpath))
                for f in dirpath.iterdir()
                if f.is_dir() and not str(f).startswith(".")
            ]
        return [
            str(f.relative_to(dirpath))
            for f in dirpath.iterdir()
            if not str(f).startswith(".")
        ]

    def schemas(self, recursive=True) -> list[str]:
        """List schemas (directories, recursively; root included as ".")."""
        # NOTE(review): the `recursive` parameter is ignored -- ls() is
        # always called with recursive=True.
        return self.ls("", only_directories=True, recursive=True)

    def tables(self, schema: str = ".") -> list[str]:
        """List tables (files) of a schema."""
        return self.ls(schema, only_files=True)

    def build_table_path(self, table: str, schema: str):
        """Absolute path of a table; schema "." means directly in basepath."""
        table_path = self.basepath
        if schema == ".":
            return table_path / table
        return table_path / schema / table

    def infos(self, table: str, schema: str = "."):
        """Get infos about the table (not implemented yet)."""
        table_path = self.build_table_path(table, schema)
        pass

    def read(self, table: str, schema: str = ".", **read_options):
        """Load a table into a DataFrame; *read_options* go to pandas.

        Raises ValueError for extensions outside ACCEPTABLE_EXTENTIONS.

        BUG FIX: the engine was hard-coded to "openpyxl" for every excel
        extension, but openpyxl cannot read ``.xls`` (xlrd is required
        for that, and is pinned in requirements); let pandas pick the
        engine from the extension instead.
        """
        table_path = self.build_table_path(table, schema)
        assert table_path.exists()
        extension = table_path.suffix
        if extension in ACCEPTABLE_EXTENTIONS["csv"]:
            return pd.read_csv(table_path, **read_options)
        if extension in ACCEPTABLE_EXTENTIONS["excel"]:
            return pd.read_excel(table_path, **read_options)
        raise ValueError("Bad extention. Can't open the table.")

    def write(self, content, table: str, schema: str = "."):
        """Write content into the table (not implemented yet)."""
        table_path = self.build_table_path(table, schema)
        pass

    def delete_table(self, table: str, schema: str = "."):
        """Delete the table (not implemented yet)."""
        table_path = self.build_table_path(table, schema)
        pass

View File

@@ -1,4 +1,5 @@
import abc
from .metadata import AbstractMetadataEngine
class AbstractRepository(abc.ABC):
    """Interface of a table store: schemas contain tables with content."""

    # metadata engine *class* (not an instance) used by concrete repositories
    metadata_engine = AbstractMetadataEngine

    @abc.abstractmethod
    def schemas(self) -> list[str]:
        """List schemas"""
        raise NotImplementedError

    @abc.abstractmethod
    def tables(self, schema) -> list[str]:
        """List table in schema"""
        raise NotImplementedError

    @abc.abstractmethod
    def infos(self, table: str, schema: str) -> dict[str, str]:
        """Get infos about the table"""
        raise NotImplementedError

    @abc.abstractmethod
    def read(self, table: str, schema: str):
        """Get content of the table"""
        raise NotImplementedError

    @abc.abstractmethod
    def write(self, content, table: str, schema: str):
        """Write content into the table"""
        raise NotImplementedError

    @abc.abstractmethod
    def delete_table(self, table: str, schema: str):
        """Delete the table"""
        raise NotImplementedError

View File

@@ -1,7 +1,6 @@
jupyter==1.0.0
pandas==1.5.0
pdf-oralia==0.3.11
pydantic==2.6.1
pandas==2.2.2
pydantic==2.8.2
click==8.1.7
dlt[duckdb]>=0.4.3a0
openpyxl>=3.0.0
openpyxl==3.1.5
xlrd==2.0.1

0
tests/__init__.py Normal file
View File

View File

Binary file not shown.

View File

@@ -0,0 +1,7 @@
Username;Identifier;First name;Last name
booker12;9012;Rachel;Booker
grey07;2070;Laura;Grey
johnson81;4081;Craig;Johnson
jenkins46;9346;Mary;Jenkins
smith79;5079;Jamie;Smith
1 Username Identifier First name Last name
2 booker12 9012 Rachel Booker
3 grey07 2070 Laura Grey
4 johnson81 4081 Craig Johnson
5 jenkins46 9346 Mary Jenkins
6 smith79 5079 Jamie Smith

View File

@@ -0,0 +1,84 @@
import shutil
from pathlib import Path

import pandas  # BUG FIX: `from pandas import pandas` raises ImportError
import pytest

from dashboard.libs.repository.fs_repository import FSRepository

# directory holding the example files copied into the tmp repository
EXAMPLE_DIR = "./tests/repository/fs_examples/"
@pytest.fixture
def location(tmp_path):
    """Build a repository layout under tmp_path from the EXAMPLE_DIR files.

    Creates username/ and salary/ directories; example files whose name
    contains "username" go to username/, everything else to salary/.
    """
    username_dir = tmp_path / "username"
    salary_dir = tmp_path / "salary"
    username_dir.mkdir()
    salary_dir.mkdir()

    for example in Path(EXAMPLE_DIR).glob("*"):
        target = username_dir if "username" in str(example) else salary_dir
        shutil.copy(example, target)

    return tmp_path
def test_init(location):
    """FSRepository should see the fixture layout.

    BUG FIX: the assertions relied on filesystem iteration order, which
    ``iterdir``/``walk`` do not guarantee -- compare sorted listings so
    the test is deterministic across platforms.
    """
    repo = FSRepository("example", location)
    assert sorted(repo.ls()) == ["salary", "username"]
    assert sorted(repo.schemas()) == [".", "salary", "username"]
    # no tables live directly at the repository root
    assert repo.tables() == []
    assert sorted(repo.tables("username")) == [
        "username-password-recovery-code.xls",
        "username-password-recovery-code.xlsx",
        "username.csv",
    ]
    assert repo.tables("salary") == ["salary.pdf"]
def test_read_csv(location):
    """Reading a ;-separated csv table forwards options to pandas."""
    repo = FSRepository("example", location)
    frame = repo.read("username.csv", "username", delimiter=";")
    expected_columns = ["Username", "Identifier", "First name", "Last name"]
    assert list(frame.columns) == expected_columns
    assert len(frame.index) == 5
def test_fake_read_xlsx(location):
    """Smoke test: pandas itself can open the .xls fixture directly."""
    repo = FSRepository("example", location)
    xls_path = location / "username" / "username-password-recovery-code.xls"
    df = pandas.read_excel(xls_path)
    print(df)
def test_read_xlsx(location):
    """Reading an .xls table through the repository yields all columns."""
    repo = FSRepository("example", location)
    frame = repo.read("username-password-recovery-code.xls", "username")
    expected_columns = [
        "Username",
        "Identifier",
        "One-time password",
        "Recovery code",
        "First name",
        "Last name",
        "Department",
        "Location",
    ]
    assert list(frame.columns) == expected_columns
    assert len(frame.index) == 5

131
tests/test_flux.py Normal file
View File

@@ -0,0 +1,131 @@
import pandas as pd
import pytest
from dashboard.libs.flux.flux import Flux, consume_flux
from dashboard.libs.repository.repository import AbstractRepository
FakeTable = pd.DataFrame
FakeSchema = dict[str, pd.DataFrame]
FakeSchemas = dict[str, FakeSchema]
class FakeRepository(AbstractRepository):
    """In-memory repository double that counts reads and writes per table."""

    @staticmethod
    def _new_entry(df):
        """Wrap a dataframe with fresh bookkeeping metadata."""
        return {
            "df": df,
            "metadata": {"status": "new", "qty_read": 0, "qty_write": 0},
        }

    def __init__(self, schemas: FakeSchemas):
        self._schemas = {
            schema_name: {
                table: self._new_entry(df) for table, df in tables.items()
            }
            for schema_name, tables in schemas.items()
        }

    def schemas(self):
        """List schemas"""
        return list(self._schemas)

    def tables(self, schema):
        """List table's name in schema"""
        return list(self._schemas[schema])

    def infos(self, table: str, schema: str) -> dict[str, str]:
        """Get infos about the table"""
        return self._schemas[schema][table]["metadata"]

    def read(self, table, schema) -> pd.DataFrame:
        """Get content of the table"""
        entry = self._schemas[schema][table]
        entry["metadata"]["qty_read"] += 1
        return entry["df"]

    def write(self, content, table, schema) -> dict[str, str]:
        """Write content into the table"""
        tables = self._schemas[schema]
        if table in tables:
            tables[table]["df"] = content
        else:
            tables[table] = self._new_entry(content)
        meta = tables[table]["metadata"]
        meta["status"] = "modified"
        meta["qty_write"] += 1
        return self.infos(table, schema)

    def delete_table(self, table, schema):
        """Delete the table"""
        raise NotImplementedError
def test_fakerepository():
    """FakeRepository tracks read/write counts and write status."""
    repo = FakeRepository(
        {
            "foo": {
                "table1": pd.DataFrame({"A": []}),
                "table2": pd.DataFrame({"B": []}),
            },
            "bar": {
                "table1": pd.DataFrame({"C": []}),
                "table2": pd.DataFrame({"D": []}),
            },
        }
    )
    assert repo.schemas() == ["foo", "bar"]
    assert repo.tables("foo") == ["table1", "table2"]

    fresh_meta = {"status": "new", "qty_read": 0, "qty_write": 0}
    assert repo.infos("table1", "foo") == fresh_meta

    # reading returns the stored frame and bumps qty_read
    assert repo.read("table1", "foo").equals(pd.DataFrame({"A": []}))
    assert repo.infos("table1", "foo") == {
        "status": "new",
        "qty_read": 1,
        "qty_write": 0,
    }

    # writing marks the table modified and bumps qty_write
    replacement = pd.DataFrame({"A": [1, 2]})
    assert repo.write(replacement, "table1", "foo") == {
        "status": "modified",
        "qty_read": 1,
        "qty_write": 1,
    }
def test_consume_flux():
    """End-to-end: read source, double it, write to the destination repo."""
    source_repository = FakeRepository(
        {
            "source": {
                "table1": pd.DataFrame({"A": [1, 2, 3]}),
            },
        }
    )
    dest_repository = FakeRepository(
        {
            "destination": {},
        }
    )
    repositories = {
        "source": source_repository,
        "dest": dest_repository,
    }

    def double(dfs):
        """Emit the first source frame multiplied by two under 'dest'."""
        return {"dest": dfs[0] * 2}

    flux = Flux(
        sources=[{"repository": "source", "schema": "source", "table": "table1"}],
        destinations={
            "dest": {"repository": "dest", "schema": "destination", "table": "table1"}
        },
        transformation=double,
    )

    state = consume_flux(flux, repositories)

    assert state.statuses["dest"] == {
        "status": "modified",
        "qty_read": 0,
        "qty_write": 1,
    }
    written = dest_repository.read("table1", "destination")
    assert written.equals(pd.DataFrame({"A": [2, 4, 6]}))