diff --git a/plesna/datastore/catalogue.py b/plesna/datastore/catalogue.py deleted file mode 100644 index de436a8..0000000 --- a/plesna/datastore/catalogue.py +++ /dev/null @@ -1 +0,0 @@ -class diff --git a/plesna/datastore/fs_datacatalogue.py b/plesna/datastore/fs_datacatalogue.py index e1945ba..dc4ba42 100644 --- a/plesna/datastore/fs_datacatalogue.py +++ b/plesna/datastore/fs_datacatalogue.py @@ -2,35 +2,33 @@ from pathlib import Path from pydantic import BaseModel, computed_field +from plesna.models.storage import Schema, Table + from .datacatalogue import DataCatalogue -class Schema(BaseModel): +class FSSchema(BaseModel): path: Path @computed_field @property - def id(self) -> str: - return str(self.path) - - @computed_field - @property - def value(self) -> str: - return str(self.path) + def ref(self) -> Schema: + return Schema( + id=str(self.path), + value=str(self.path), + ) -class Table(BaseModel): +class FSTable(BaseModel): path: Path @computed_field @property - def id(self) -> str: - return str(self.path) - - @computed_field - @property - def value(self) -> str: - return str(self.path) + def ref(self) -> Table: + return Table( + id=str(self.path), + value=str(self.path), + ) class FSDataCatalogue(DataCatalogue): @@ -74,13 +72,12 @@ class FSDataCatalogue(DataCatalogue): if not str(f).startswith(".") ] - def schemas(self) -> dict[str, Schema]: + def schemas(self) -> dict[str, FSSchema]: """List schemas (sub directories within basepath)""" subdirectories = self.ls("", only_directories=True, recursive=True) - return {str(path): Schema(path=path) for path in subdirectories} + return {str(path): FSSchema(path=path) for path in subdirectories} - def tables(self, schema_id=".") -> dict[str, Table]: + def tables(self, schema_id=".") -> dict[str, FSTable]: """List table in schema (which are files in the directory)""" schema_path = schema_id - tables = [Table(path=path) for path in self.ls(schema_path, only_files=True)] - return {table.id: table for table in tables} 
+ return {path: FSTable(path=path) for path in self.ls(schema_path, only_files=True)} diff --git a/plesna/models/storage.py b/plesna/models/storage.py new file mode 100644 index 0000000..73d44cc --- /dev/null +++ b/plesna/models/storage.py @@ -0,0 +1,25 @@ +from pydantic import BaseModel + + +class Schema(BaseModel): + """Logical aggregation for Table + + id: unique identifier for the schema + value: string which describes where to find the schema in the storage system + + """ + + id: str + value: str + + +class Table(BaseModel): + """Place where data are stored + + id: unique identifier for the table + value: string which describes where to find the table in the storage system + + """ + + id: str + value: str diff --git a/tests/datastore/test_fs_datacatalogue.py b/tests/datastore/test_fs_datacatalogue.py index 8f623f1..bb31314 100644 --- a/tests/datastore/test_fs_datacatalogue.py +++ b/tests/datastore/test_fs_datacatalogue.py @@ -42,12 +42,12 @@ def test_init(location): def test_list_schema(location): repo = FSDataCatalogue("example", location) - assert {id: s.model_dump()["id"] for id, s in repo.schemas().items()} == { + assert {id: s.model_dump()["ref"]["id"] for id, s in repo.schemas().items()} == { ".": ".", "username": "username", "salary": "salary", } - assert {id: s.model_dump()["value"] for id, s in repo.schemas().items()} == { + assert {id: s.model_dump()["ref"]["value"] for id, s in repo.schemas().items()} == { ".": ".", "username": "username", "salary": "salary", @@ -62,11 +62,11 @@ def test_list_schema(location): def test_list_tables(location): repo = FSDataCatalogue("example", location) assert repo.tables() == {} - assert {id: t.model_dump()["value"] for id,t in repo.tables("username").items()} == { + assert {id: t.model_dump()["ref"]["value"] for id,t in repo.tables("username").items()} == { "username.csv": "username.csv", "username-password-recovery-code.xlsx": "username-password-recovery-code.xlsx", "username-password-recovery-code.xls": 
"username-password-recovery-code.xls", } - assert {id: t.model_dump()["value"] for id,t in repo.tables("salary").items()} == { + assert {id: t.model_dump()["ref"]["value"] for id,t in repo.tables("salary").items()} == { "salary.pdf": "salary.pdf", } diff --git a/tests/repository/__init__.py b/tests/repository/__init__.py deleted file mode 100644 index e69de29..0000000 diff --git a/tests/repository/fs_examples/salary.pdf b/tests/repository/fs_examples/salary.pdf deleted file mode 100644 index 0cbd711..0000000 Binary files a/tests/repository/fs_examples/salary.pdf and /dev/null differ diff --git a/tests/repository/fs_examples/username-password-recovery-code.xls b/tests/repository/fs_examples/username-password-recovery-code.xls deleted file mode 100644 index af0329b..0000000 Binary files a/tests/repository/fs_examples/username-password-recovery-code.xls and /dev/null differ diff --git a/tests/repository/fs_examples/username-password-recovery-code.xlsx b/tests/repository/fs_examples/username-password-recovery-code.xlsx deleted file mode 100644 index 1e2d2dd..0000000 Binary files a/tests/repository/fs_examples/username-password-recovery-code.xlsx and /dev/null differ diff --git a/tests/repository/fs_examples/username.csv b/tests/repository/fs_examples/username.csv deleted file mode 100644 index 45d43d4..0000000 --- a/tests/repository/fs_examples/username.csv +++ /dev/null @@ -1,7 +0,0 @@ -Username;Identifier;First name;Last name -booker12;9012;Rachel;Booker -grey07;2070;Laura;Grey -johnson81;4081;Craig;Johnson -jenkins46;9346;Mary;Jenkins -smith79;5079;Jamie;Smith - diff --git a/tests/repository/test_fs_repository.py b/tests/repository/test_fs_repository.py deleted file mode 100644 index b6617dd..0000000 --- a/tests/repository/test_fs_repository.py +++ /dev/null @@ -1,84 +0,0 @@ -import shutil -from pathlib import Path - -import pytest -from pandas import pandas - -from dashboard.libs.repository.fs_repository import FSRepository - -EXAMPLE_DIR = 
"./tests/repository/fs_examples/" - - -@pytest.fixture -def location(tmp_path): - loc = tmp_path - username_loc = loc / "username" - username_loc.mkdir() - salary_loc = loc / "salary" - salary_loc.mkdir() - example_src = Path(EXAMPLE_DIR) - - for f in example_src.glob("*"): - if "username" in str(f): - shutil.copy(f, username_loc) - else: - shutil.copy(f, salary_loc) - - return loc - - -def test_init(location): - repo = FSRepository("example", location) - assert repo.ls() == [ - "username", - "salary", - ] - assert repo.schemas() == [ - ".", - "username", - "salary", - ] - - assert repo.tables() == [] - assert repo.tables("username") == [ - "username.csv", - "username-password-recovery-code.xlsx", - "username-password-recovery-code.xls", - ] - assert repo.tables("salary") == ["salary.pdf"] - - -def test_read_csv(location): - repo = FSRepository("example", location) - username = repo.read("username.csv", "username", delimiter=";") - assert list(username.columns) == [ - "Username", - "Identifier", - "First name", - "Last name", - ] - assert len(username.index) == 5 - - -def test_fake_read_xlsx(location): - repo = FSRepository("example", location) - df = pandas.read_excel( - location / "username" / "username-password-recovery-code.xls" - ) - print(df) - - -def test_read_xlsx(location): - repo = FSRepository("example", location) - username = repo.read("username-password-recovery-code.xls", "username") - assert list(username.columns) == [ - "Username", - "Identifier", - "One-time password", - "Recovery code", - "First name", - "Last name", - "Department", - "Location", - ] - assert len(username.index) == 5