Feat: create fs_datacatalogue

2024-11-05 06:55:05 +01:00
parent 88795fdad3
commit 07fb92e2fa
10 changed files with 47 additions and 117 deletions
--- a/plesna/datastore/catalogue.py
+++ b/plesna/datastore/catalogue.py
@@ -1 +0,0 @@
 class 
--- a/plesna/datastore/fs_datacatalogue.py
+++ b/plesna/datastore/fs_datacatalogue.py
@@ -2,35 +2,33 @@ from pathlib import Path
 from pydantic import BaseModel, computed_field
 from plesna.models.storage import Schema, Table
 from .datacatalogue import DataCatalogue
-class Schema(BaseModel):
+class FSSchema(BaseModel):
    path: Path
    @computed_field
    @property
-    def id(self) -> str:
+    def ref(self) -> Schema:
-        return str(self.path)
+        return Schema(
-
+            id=str(self.path),
-    @computed_field
+            value=str(self.path),
-    @property
+        )
    def value(self) -> str:
        return str(self.path)
-class Table(BaseModel):
+class FSTable(BaseModel):
    path: Path
    @computed_field
    @property
-    def id(self) -> str:
+    def ref(self) -> Table:
-        return str(self.path)
+        return Table(
-
+            id=str(self.path),
-    @computed_field
+            value=str(self.path),
-    @property
+        )
    def value(self) -> str:
        return str(self.path)
 class FSDataCatalogue(DataCatalogue):
@@ -74,13 +72,12 @@ class FSDataCatalogue(DataCatalogue):
            if not str(f).startswith(".")
        ]
-    def schemas(self) -> dict[str, Schema]:
+    def schemas(self) -> dict[str, FSSchema]:
        """List schemas (sub directories within basepath)"""
        subdirectories = self.ls("", only_directories=True, recursive=True)
-        return {str(path): Schema(path=path) for path in subdirectories}
+        return {str(path): FSSchema(path=path) for path in subdirectories}
-    def tables(self, schema_id=".") -> dict[str, Table]:
+    def tables(self, schema_id=".") -> dict[str, FSTable]:
        """List table in schema (which are files in the directory)"""
        schema_path = schema_id
-        tables = [Table(path=path) for path in self.ls(schema_path, only_files=True)]
+        return {path: FSTable(path=path) for path in self.ls(schema_path, only_files=True)}
        return {table.id: table for table in tables}
--- a/plesna/models/storage.py
+++ b/plesna/models/storage.py
@@ -0,0 +1,25 @@
 from pydantic import BaseModel
 class Schema(BaseModel):
    """Logical aggregation of Tables.
    id: unique identifier for the schema
    value: string which describes where to find the schema in the storage system
    """
    id: str
    value: str
 class Table(BaseModel):
    """Place where data are stored.
    id: unique identifier for the table
    value: string which describes where to find the table in the storage system
    """
    id: str
    value: str
--- a/tests/datastore/test_fs_datacatalogue.py
+++ b/tests/datastore/test_fs_datacatalogue.py
@@ -42,12 +42,12 @@ def test_init(location):
 def test_list_schema(location):
    repo = FSDataCatalogue("example", location)
-    assert {id: s.model_dump()["id"] for id, s in repo.schemas().items()} == {
+    assert {id: s.model_dump()["ref"]["id"] for id, s in repo.schemas().items()} == {
        ".": ".",
        "username": "username",
        "salary": "salary",
    }
-    assert {id: s.model_dump()["value"] for id, s in repo.schemas().items()} == {
+    assert {id: s.model_dump()["ref"]["value"] for id, s in repo.schemas().items()} == {
        ".": ".",
        "username": "username",
        "salary": "salary",
@@ -62,11 +62,11 @@ def test_list_schema(location):
 def test_list_tables(location):
    repo = FSDataCatalogue("example", location)
    assert repo.tables() == {}
-    assert {id: t.model_dump()["value"] for id,t in repo.tables("username").items()} == {
+    assert {id: t.model_dump()["ref"]["value"] for id,t in repo.tables("username").items()} == {
        "username.csv": "username.csv",
        "username-password-recovery-code.xlsx": "username-password-recovery-code.xlsx",
        "username-password-recovery-code.xls": "username-password-recovery-code.xls",
    }
-    assert {id: t.model_dump()["value"] for id,t in repo.tables("salary").items()} == {
+    assert {id: t.model_dump()["ref"]["value"] for id,t in repo.tables("salary").items()} == {
        "salary.pdf": "salary.pdf",
    }
--- a/tests/repository/init.py
+++ b/tests/repository/init.py
--- a/tests/repository/fs_examples/salary.pdf
+++ b/tests/repository/fs_examples/salary.pdf
--- a/tests/repository/fs_examples/username-password-recovery-code.xls
+++ b/tests/repository/fs_examples/username-password-recovery-code.xls
--- a/tests/repository/fs_examples/username-password-recovery-code.xlsx
+++ b/tests/repository/fs_examples/username-password-recovery-code.xlsx
--- a/tests/repository/fs_examples/username.csv
+++ b/tests/repository/fs_examples/username.csv
@@ -1,7 +0,0 @@
 Username;Identifier;First name;Last name
 booker12;9012;Rachel;Booker
 grey07;2070;Laura;Grey
 johnson81;4081;Craig;Johnson
 jenkins46;9346;Mary;Jenkins
 smith79;5079;Jamie;Smith
--- a/tests/repository/test_fs_repository.py
+++ b/tests/repository/test_fs_repository.py
@@ -1,84 +0,0 @@
 import shutil
 from pathlib import Path
 import pytest
 from pandas import pandas
 from dashboard.libs.repository.fs_repository import FSRepository
 EXAMPLE_DIR = "./tests/repository/fs_examples/"
@pytest.fixture
 def location(tmp_path):
    """Build a storage tree under ``tmp_path`` filled with the example files.
    Creates the ``username`` and ``salary`` sub-directories, copies every entry
    matched by ``EXAMPLE_DIR/*`` into one of them, and returns ``tmp_path``.
    """
    loc = tmp_path
    username_loc = loc / "username"
    username_loc.mkdir()
    salary_loc = loc / "salary"
    salary_loc.mkdir()
    example_src = Path(EXAMPLE_DIR)
    for f in example_src.glob("*"):
        # The substring test runs on the whole path string, not only the file
        # name: anything whose path contains "username" goes to the username
        # directory, everything else goes to the salary directory.
        if "username" in str(f):
            shutil.copy(f, username_loc)
        else:
            shutil.copy(f, salary_loc)
    return loc
 def test_init(location):
    """Check what ``ls``, ``schemas`` and ``tables`` return for the example tree."""
    repo = FSRepository("example", location)
    # NOTE(review): every comparison below is against a list, so the expected
    # order matters — confirm FSRepository returns entries in a stable order.
    assert repo.ls() == [
        "username",
        "salary",
    ]
    # "." is expected alongside the two sub-directories.
    assert repo.schemas() == [
        ".",
        "username",
        "salary",
    ]
    # Called without argument, no table is expected (the fixture puts files
    # only inside the sub-directories).
    assert repo.tables() == []
    assert repo.tables("username") == [
        "username.csv",
        "username-password-recovery-code.xlsx",
        "username-password-recovery-code.xls",
    ]
    assert repo.tables("salary") == ["salary.pdf"]
 def test_read_csv(location):
    """Read ``username.csv`` through the repository and check its columns and row count."""
    repo = FSRepository("example", location)
    # The example CSV is semicolon-separated, hence the explicit delimiter.
    username = repo.read("username.csv", "username", delimiter=";")
    assert list(username.columns) == [
        "Username",
        "Identifier",
        "First name",
        "Last name",
    ]
    assert len(username.index) == 5
 def test_fake_read_xlsx(location):
    """Read the ``.xls`` example directly with pandas and print it.
    There is no assertion here: the test only fails if ``read_excel`` raises.
    """
    # NOTE(review): ``repo`` is created but never used in this test.
    repo = FSRepository("example", location)
    df = pandas.read_excel(
        location / "username" / "username-password-recovery-code.xls"
    )
    print(df)
 def test_read_xlsx(location):
    """Read the ``.xls`` example through the repository and check its columns and row count."""
    repo = FSRepository("example", location)
    # Unlike the CSV test, no extra reader option is passed to ``read``.
    username = repo.read("username-password-recovery-code.xls", "username")
    assert list(username.columns) == [
        "Username",
        "Identifier",
        "One-time password",
        "Recovery code",
        "First name",
        "Last name",
        "Department",
        "Location",
    ]
    assert len(username.index) == 5