Feat: create fs_datacatalogue

2024-11-05 06:55:05 +01:00
parent 88795fdad3
commit 07fb92e2fa
10 changed files with 47 additions and 117 deletions
--- a/plesna/datastore/catalogue.py
+++ b/plesna/datastore/catalogue.py
@@ -1 +0,0 @@
-class 
--- a/plesna/datastore/fs_datacatalogue.py
+++ b/plesna/datastore/fs_datacatalogue.py
@@ -2,35 +2,33 @@ from pathlib import Path

 from pydantic import BaseModel, computed_field

+from plesna.models.storage import Schema, Table
+
 from .datacatalogue import DataCatalogue


-class Schema(BaseModel):
+class FSSchema(BaseModel):
    path: Path

    @computed_field
    @property
-    def id(self) -> str:
-        return str(self.path)
-
-    @computed_field
-    @property
-    def value(self) -> str:
-        return str(self.path)
+    def ref(self) -> Schema:
+        return Schema(
+            id=str(self.path),
+            value=str(self.path),
+        )


-class Table(BaseModel):
+class FSTable(BaseModel):
    path: Path

    @computed_field
    @property
-    def id(self) -> str:
-        return str(self.path)
-
-    @computed_field
-    @property
-    def value(self) -> str:
-        return str(self.path)
+    def ref(self) -> Table:
+        return Table(
+            id=str(self.path),
+            value=str(self.path),
+        )


 class FSDataCatalogue(DataCatalogue):
@@ -74,13 +72,12 @@ class FSDataCatalogue(DataCatalogue):
            if not str(f).startswith(".")
        ]

-    def schemas(self) -> dict[str, Schema]:
+    def schemas(self) -> dict[str, FSSchema]:
        """List schemas (sub directories within basepath)"""
        subdirectories = self.ls("", only_directories=True, recursive=True)
-        return {str(path): Schema(path=path) for path in subdirectories}
+        return {str(path): FSSchema(path=path) for path in subdirectories}

-    def tables(self, schema_id=".") -> dict[str, Table]:
+    def tables(self, schema_id=".") -> dict[str, FSTable]:
        """List table in schema (which are files in the directory)"""
        schema_path = schema_id
-        tables = [Table(path=path) for path in self.ls(schema_path, only_files=True)]
-        return {table.id: table for table in tables}
+        return {path: FSTable(path=path) for path in self.ls(schema_path, only_files=True)}
--- a/plesna/models/storage.py
+++ b/plesna/models/storage.py
@@ -0,0 +1,25 @@
+from pydantic import BaseModel
+
+
+class Schema(BaseModel):
+    """Logical aggregation of Tables
+
+    id: unique identifier for the schema
+    value: string describing where to find the schema in the storage system
+
+    """
+
+    id: str
+    value: str
+
+
+class Table(BaseModel):
+    """Place where data are stored
+
+    id: unique identifier for the table
+    value: string describing where to find the table in the storage system
+
+    """
+
+    id: str
+    value: str
--- a/tests/datastore/test_fs_datacatalogue.py
+++ b/tests/datastore/test_fs_datacatalogue.py
@@ -42,12 +42,12 @@ def test_init(location):

 def test_list_schema(location):
    repo = FSDataCatalogue("example", location)
-    assert {id: s.model_dump()["id"] for id, s in repo.schemas().items()} == {
+    assert {id: s.model_dump()["ref"]["id"] for id, s in repo.schemas().items()} == {
        ".": ".",
        "username": "username",
        "salary": "salary",
    }
-    assert {id: s.model_dump()["value"] for id, s in repo.schemas().items()} == {
+    assert {id: s.model_dump()["ref"]["value"] for id, s in repo.schemas().items()} == {
        ".": ".",
        "username": "username",
        "salary": "salary",
@@ -62,11 +62,11 @@ def test_list_schema(location):
 def test_list_tables(location):
    repo = FSDataCatalogue("example", location)
    assert repo.tables() == {}
-    assert {id: t.model_dump()["value"] for id,t in repo.tables("username").items()} == {
+    assert {id: t.model_dump()["ref"]["value"] for id,t in repo.tables("username").items()} == {
        "username.csv": "username.csv",
        "username-password-recovery-code.xlsx": "username-password-recovery-code.xlsx",
        "username-password-recovery-code.xls": "username-password-recovery-code.xls",
    }
-    assert {id: t.model_dump()["value"] for id,t in repo.tables("salary").items()} == {
+    assert {id: t.model_dump()["ref"]["value"] for id,t in repo.tables("salary").items()} == {
        "salary.pdf": "salary.pdf",
    }
--- a/tests/repository/init.py
+++ b/tests/repository/init.py
--- a/tests/repository/fs_examples/salary.pdf
+++ b/tests/repository/fs_examples/salary.pdf
--- a/tests/repository/fs_examples/username-password-recovery-code.xls
+++ b/tests/repository/fs_examples/username-password-recovery-code.xls
--- a/tests/repository/fs_examples/username-password-recovery-code.xlsx
+++ b/tests/repository/fs_examples/username-password-recovery-code.xlsx
--- a/tests/repository/fs_examples/username.csv
+++ b/tests/repository/fs_examples/username.csv
@@ -1,7 +0,0 @@
-Username;Identifier;First name;Last name
-booker12;9012;Rachel;Booker
-grey07;2070;Laura;Grey
-johnson81;4081;Craig;Johnson
-jenkins46;9346;Mary;Jenkins
-smith79;5079;Jamie;Smith
-
--- a/tests/repository/test_fs_repository.py
+++ b/tests/repository/test_fs_repository.py
@@ -1,84 +0,0 @@
-import shutil
-from pathlib import Path
-
-import pytest
-from pandas import pandas
-
-from dashboard.libs.repository.fs_repository import FSRepository
-
-EXAMPLE_DIR = "./tests/repository/fs_examples/"
-
-
-@pytest.fixture
-def location(tmp_path):
-    loc = tmp_path
-    username_loc = loc / "username"
-    username_loc.mkdir()
-    salary_loc = loc / "salary"
-    salary_loc.mkdir()
-    example_src = Path(EXAMPLE_DIR)
-
-    for f in example_src.glob("*"):
-        if "username" in str(f):
-            shutil.copy(f, username_loc)
-        else:
-            shutil.copy(f, salary_loc)
-
-    return loc
-
-
-def test_init(location):
-    repo = FSRepository("example", location)
-    assert repo.ls() == [
-        "username",
-        "salary",
-    ]
-    assert repo.schemas() == [
-        ".",
-        "username",
-        "salary",
-    ]
-
-    assert repo.tables() == []
-    assert repo.tables("username") == [
-        "username.csv",
-        "username-password-recovery-code.xlsx",
-        "username-password-recovery-code.xls",
-    ]
-    assert repo.tables("salary") == ["salary.pdf"]
-
-
-def test_read_csv(location):
-    repo = FSRepository("example", location)
-    username = repo.read("username.csv", "username", delimiter=";")
-    assert list(username.columns) == [
-        "Username",
-        "Identifier",
-        "First name",
-        "Last name",
-    ]
-    assert len(username.index) == 5
-
-
-def test_fake_read_xlsx(location):
-    repo = FSRepository("example", location)
-    df = pandas.read_excel(
-        location / "username" / "username-password-recovery-code.xls"
-    )
-    print(df)
-
-
-def test_read_xlsx(location):
-    repo = FSRepository("example", location)
-    username = repo.read("username-password-recovery-code.xls", "username")
-    assert list(username.columns) == [
-        "Username",
-        "Identifier",
-        "One-time password",
-        "Recovery code",
-        "First name",
-        "Last name",
-        "Department",
-        "Location",
-    ]
-    assert len(username.index) == 5