From 8a43a93cda44e749b1241a8cfc3e6a799c270959 Mon Sep 17 00:00:00 2001 From: Bertrand Benjamin Date: Sun, 5 Jan 2025 15:13:38 +0100 Subject: [PATCH] refact: repo id are not based on path but on id --- plesna/storage/repository/fs_repository.py | 59 ++++++++++------------ tests/storage/test_fs_repository.py | 54 ++++++++++---------- 2 files changed, 53 insertions(+), 60 deletions(-) diff --git a/plesna/storage/repository/fs_repository.py b/plesna/storage/repository/fs_repository.py index 7b463b2..f64d624 100644 --- a/plesna/storage/repository/fs_repository.py +++ b/plesna/storage/repository/fs_repository.py @@ -7,25 +7,10 @@ from plesna.models.storage import Partition, Schema, Table from plesna.storage.repository.repository import Repository -class FSPartition(BaseModel): - name: str - path: Path - - @computed_field - @property - def ref(self) -> Partition: - return Partition( - id=str(self.path), - repo_id=str(self.path.parent.parent.parent), - schema_id=str(self.path.parent.parent), - table_id=str(self.path.parent), - name=self.name, - value=str(self.path.absolute()), - ) - - class FSTable(BaseModel): name: str + repo_id: str + schema_id: str id: str path: Path is_partitionned: bool @@ -41,8 +26,8 @@ class FSTable(BaseModel): return Table( id=self.id, - repo_id=str(self.path.parent.parent), - schema_id=str(self.path.parent), + repo_id=self.repo_id, + schema_id=self.schema_id, name=self.name, value=str(self.path.absolute()), partitions=self.partitions, @@ -52,6 +37,7 @@ class FSTable(BaseModel): class FSSchema(BaseModel): name: str + repo_id: str id: str path: Path tables: list[str] @@ -61,7 +47,7 @@ class FSSchema(BaseModel): def ref(self) -> Schema: return Schema( id=self.id, - repo_id=str(self.path.parent), + repo_id=self.repo_id, name=self.name, value=str(self.path.absolute()), tables=self.tables, @@ -82,8 +68,8 @@ class FSRepository(Repository): """ ID_FMT = { - "schema": "{repo_name}-{schema_name}", - "table": "{repo_name}-{schema_name}-{table_name}", + "schema": "{repo_id}-{schema_name}", + "table": "{schema_id}-{table_name}", } def __init__(self, id: str, name: str, basepath: str): @@ -139,22 +125,29 @@ class FSRepository(Repository): """List schemas (sub directories within basepath)""" subdirectories = self.ls("", only_directories=True) return [ - self.ID_FMT["schema"].format(repo_name=self.name, schema_name=d) for d in subdirectories + self.ID_FMT["schema"].format(repo_id=self.id, schema_name=d) for d in subdirectories ] def _schema(self, schema_id: str) -> FSSchema: """List schemas (sub directories within basepath)""" parsed = self.parse_id(schema_id, "schema") - repo_name = parsed["repo_name"] + repo_id = parsed["repo_id"] schema_name = parsed["schema_name"] schema_path = self._basepath / schema_name - if repo_name != self.name: + if repo_id != self.id: raise FSRepositoryError("Trying to get schema that don't belong in this repository") tables = self.tables(schema_id) - return FSSchema(name=schema_name, id=schema_id, path=schema_path, tables=tables) + return FSSchema( + name=schema_name, + id=schema_id, + repo_id=self.id, + schema_id=schema_id, + path=schema_path, + tables=tables, + ) def schema(self, schema_id: str) -> Schema: return self._schema(schema_id).ref @@ -162,7 +155,7 @@ class FSRepository(Repository): def _tables(self, schema_id: str) -> list[str]: parsed = self.parse_id(schema_id, "schema") tables = self.ls(parsed["schema_name"]) - return [self.ID_FMT["table"].format(table_name=t, **parsed) for t in tables] + return [self.ID_FMT["table"].format(table_name=t, schema_id=schema_id) for t in tables] def tables(self, schema_id: str = "") -> list[str]: if schema_id: @@ -176,14 +169,12 @@ class FSRepository(Repository): def _table(self, table_id: str) -> FSTable: """Get infos on the table""" parsed = self.parse_id(table_id, "table") - if parsed["repo_name"] != self.name: - raise FSRepositoryError("Trying to get table that don't belong in this repository") + schema = self._schema(parsed["schema_id"]) - schema_path = self._basepath / parsed["schema_name"] - if not schema_path.exists(): - raise FSRepositoryError(f"The schema {parsed['schema_name']} does not exists.") + if not schema.path.exists(): + raise FSRepositoryError(f"The schema {schema.id} does not exists.") - table_subpath = f"{parsed['schema_name']}/{parsed['table_name']}" + table_subpath = f"{schema.name}/{parsed['table_name']}" table_path = self._basepath / table_subpath is_partitionned = table_path.is_dir() @@ -195,6 +186,8 @@ class FSRepository(Repository): return FSTable( name=parsed["table_name"], id=table_id, + repo_id=self.id, + schema_id=schema.id, path=table_path, is_partitionned=is_partitionned, partitions=partitions, diff --git a/tests/storage/test_fs_repository.py b/tests/storage/test_fs_repository.py index a40aa95..434e30e 100644 --- a/tests/storage/test_fs_repository.py +++ b/tests/storage/test_fs_repository.py @@ -45,50 +45,50 @@ def test_init(location): @pytest.fixture def repository(location) -> FSRepository: - return FSRepository("example", "example", location) + return FSRepository("repo_id", "example", location) def test_list_schemas(repository): - assert repository.schemas() == ["example-schema"] + assert repository.schemas() == ["repo_id-schema"] def test_describe_schema(location, repository): - schema = repository.schema("example-schema") + schema = repository.schema("repo_id-schema") assert schema.name == "schema" - assert schema.id == "example-schema" - assert schema.repo_id == str(location) + assert schema.id == "repo_id-schema" + assert schema.repo_id == "repo_id" assert schema.value == str(location / "schema") assert schema.tables == [ - "example-schema-username", - "example-schema-recovery", - "example-schema-salary", + "repo_id-schema-username", + "repo_id-schema-recovery", + "repo_id-schema-salary", ] def test_list_tables_schema(repository): - assert repository.schema("example-schema").tables == [ - "example-schema-username", - "example-schema-recovery", - "example-schema-salary", + assert repository.schema("repo_id-schema").tables == [ + "repo_id-schema-username", + "repo_id-schema-recovery", + "repo_id-schema-salary", ] - assert repository.tables("example-schema") == [ - "example-schema-username", - "example-schema-recovery", - "example-schema-salary", + assert repository.tables("repo_id-schema") == [ + "repo_id-schema-username", + "repo_id-schema-recovery", + "repo_id-schema-salary", ] assert repository.tables() == [ - "example-schema-username", - "example-schema-recovery", - "example-schema-salary", + "repo_id-schema-username", + "repo_id-schema-recovery", + "repo_id-schema-salary", ] def test_describe_table(location, repository): - table = repository.table("example-schema-username") + table = repository.table("repo_id-schema-username") - assert table.id == "example-schema-username" - assert table.repo_id == str(location) - assert table.schema_id == str(location / "schema") + assert table.id == "repo_id-schema-username" + assert table.repo_id == "repo_id" + assert table.schema_id == "repo_id-schema" assert table.name == "username" assert table.value == str(location / "schema" / "username") assert table.partitions == ["username.csv"] @@ -96,11 +96,11 @@ def test_describe_table(location, repository): def test_describe_table_with_partitions(location, repository): - table = repository.table("example-schema-recovery") + table = repository.table("repo_id-schema-recovery") - assert table.id == "example-schema-recovery" - assert table.repo_id == str(location) - assert table.schema_id == str(location / "schema") + assert table.id == "repo_id-schema-recovery" + assert table.repo_id == "repo_id" + assert table.schema_id == "repo_id-schema" assert table.name == "recovery" assert table.value == str(location / "schema" / "recovery") assert table.partitions == [