diff --git a/plesna/datastore/datacatalogue.py b/plesna/datastore/datacatalogue.py index bde7398..eb9cf9d 100644 --- a/plesna/datastore/datacatalogue.py +++ b/plesna/datastore/datacatalogue.py @@ -1,18 +1,31 @@ import abc +from plesna.models.storage import Schema, Table + class DataCatalogue: def __init__(self): pass + @property @abc.abstractmethod - def schemas(self) -> dict[str:str]: - """List schemas""" + def schemas(self) -> list[str]: + """List schema's names""" raise NotImplementedError @abc.abstractmethod - def tables(self, schema) -> dict[str:str]: - """List table in schema""" + def schema(self, name: str) -> Schema: + """Get the schema properties""" + raise NotImplementedError + + @abc.abstractmethod + def tables(self, schema:str) -> list[str]: + """List table's name in schema""" + raise NotImplementedError + + @abc.abstractmethod + def table(self, schema:str, table:str) -> Table: + """Get the table properties""" raise NotImplementedError @abc.abstractmethod diff --git a/plesna/datastore/fs_datacatalogue.py b/plesna/datastore/fs_datacatalogue.py index dc4ba42..30a2c26 100644 --- a/plesna/datastore/fs_datacatalogue.py +++ b/plesna/datastore/fs_datacatalogue.py @@ -7,18 +7,6 @@ from plesna.models.storage import Schema, Table from .datacatalogue import DataCatalogue -class FSSchema(BaseModel): - path: Path - - @computed_field - @property - def ref(self) -> Schema: - return Schema( - id=str(self.path), - value=str(self.path), - ) - - class FSTable(BaseModel): path: Path @@ -31,6 +19,20 @@ class FSTable(BaseModel): ) +class FSSchema(BaseModel): + path: Path + tables: list[str] + + @computed_field + @property + def ref(self) -> Schema: + return Schema( + id=str(self.path), + value=str(self.path), + ) + + + class FSDataCatalogue(DataCatalogue): """DataCatalogue based on files tree structure""" @@ -72,12 +74,18 @@ class FSDataCatalogue(DataCatalogue): if not str(f).startswith(".") ] - def schemas(self) -> dict[str, FSSchema]: + @property + def schemas(self) -> list[str]: """List schemas (sub directories within basepath)""" subdirectories = self.ls("", only_directories=True, recursive=True) - return {str(path): FSSchema(path=path) for path in subdirectories} + return [str(d) for d in subdirectories] - def tables(self, schema_id=".") -> dict[str, FSTable]: + def schema(self, schema: str) -> FSSchema: + """List schemas (sub directories within basepath)""" + tables = self.ls(schema, only_files=True) + return FSSchema(path=Path(schema), tables=tables) + + def table(self, schema: str, table:str) -> FSTable: """List table in schema (which are files in the directory)""" schema_path = schema_id return {path: FSTable(path=path) for path in self.ls(schema_path, only_files=True)} diff --git a/tests/datastore/test_fs_datacatalogue.py b/tests/datastore/test_fs_datacatalogue.py index 47257c9..c2e2844 100644 --- a/tests/datastore/test_fs_datacatalogue.py +++ b/tests/datastore/test_fs_datacatalogue.py @@ -4,6 +4,7 @@ from pathlib import Path import pytest from plesna.datastore.fs_datacatalogue import FSDataCatalogue +from plesna.models.storage import Schema FIXTURE_DIR = Path(__file__).parent.parent / Path("./raw_datas/") @@ -42,31 +43,19 @@ def test_init(location): def test_list_schema(location): repo = FSDataCatalogue("example", location) - assert {id: s.model_dump()["ref"]["id"] for id, s in repo.schemas().items()} == { - ".": ".", - "username": "username", - "salary": "salary", - } - assert {id: s.model_dump()["ref"]["value"] for id, s in repo.schemas().items()} == { - ".": ".", - "username": "username", - "salary": "salary", - } - assert {id: s.model_dump()["path"] for id, s in repo.schemas().items()} == { - ".": Path("."), - "username": Path("username"), - "salary": Path("salary"), - } + assert repo.schemas == [".", "username", "salary"] + assert repo.schema(".").ref == Schema(id=".", value=".") + assert repo.schema("username").ref == Schema(id="username", value="username") -def test_list_tables(location): +def test_list_tables_schema(location): repo = FSDataCatalogue("example", location) - assert repo.tables() == {} - assert {id: t.model_dump()["ref"]["value"] for id,t in repo.tables("username").items()} == { - "username.csv": "username.csv", - "username-password-recovery-code.xlsx": "username-password-recovery-code.xlsx", - "username-password-recovery-code.xls": "username-password-recovery-code.xls", - } - assert {id: t.model_dump()["ref"]["value"] for id,t in repo.tables("salary").items()} == { - "salary.pdf": "salary.pdf", - } + + assert repo.schema(".").tables == [] + assert repo.schema("username").tables == [ + 'username.csv', + 'username-password-recovery-code.xlsx', + 'username-password-recovery-code.xls', + ] + assert repo.schema("salary").tables == ["salary.pdf"] +