From 8a43a93cda44e749b1241a8cfc3e6a799c270959 Mon Sep 17 00:00:00 2001
From: Bertrand Benjamin <benjamin.bertrand@opytex.org>
Date: Sun, 5 Jan 2025 15:13:38 +0100
Subject: [PATCH] refact: repo id are not based on path but on id

---
 plesna/storage/repository/fs_repository.py | 59 ++++++++++------------
 tests/storage/test_fs_repository.py        | 54 ++++++++++----------
 2 files changed, 53 insertions(+), 60 deletions(-)

diff --git a/plesna/storage/repository/fs_repository.py b/plesna/storage/repository/fs_repository.py
index 7b463b2..f64d624 100644
--- a/plesna/storage/repository/fs_repository.py
+++ b/plesna/storage/repository/fs_repository.py
@@ -7,25 +7,10 @@ from plesna.models.storage import Partition, Schema, Table
 from plesna.storage.repository.repository import Repository
 
 
-class FSPartition(BaseModel):
-    name: str
-    path: Path
-
-    @computed_field
-    @property
-    def ref(self) -> Partition:
-        return Partition(
-            id=str(self.path),
-            repo_id=str(self.path.parent.parent.parent),
-            schema_id=str(self.path.parent.parent),
-            table_id=str(self.path.parent),
-            name=self.name,
-            value=str(self.path.absolute()),
-        )
-
-
 class FSTable(BaseModel):
     name: str
+    repo_id: str
+    schema_id: str
     id: str
     path: Path
     is_partitionned: bool
@@ -41,8 +26,8 @@ class FSTable(BaseModel):
 
         return Table(
             id=self.id,
-            repo_id=str(self.path.parent.parent),
-            schema_id=str(self.path.parent),
+            repo_id=self.repo_id,
+            schema_id=self.schema_id,
             name=self.name,
             value=str(self.path.absolute()),
             partitions=self.partitions,
@@ -52,6 +37,7 @@ class FSTable(BaseModel):
 
 class FSSchema(BaseModel):
     name: str
+    repo_id: str
     id: str
     path: Path
     tables: list[str]
@@ -61,7 +47,7 @@ class FSSchema(BaseModel):
     def ref(self) -> Schema:
         return Schema(
             id=self.id,
-            repo_id=str(self.path.parent),
+            repo_id=self.repo_id,
             name=self.name,
             value=str(self.path.absolute()),
             tables=self.tables,
@@ -82,8 +68,8 @@ class FSRepository(Repository):
     """
 
     ID_FMT = {
-        "schema": "{repo_name}-{schema_name}",
-        "table": "{repo_name}-{schema_name}-{table_name}",
+        "schema": "{repo_id}-{schema_name}",
+        "table": "{schema_id}-{table_name}",
     }
 
     def __init__(self, id: str, name: str, basepath: str):
@@ -139,22 +125,29 @@ class FSRepository(Repository):
         """List schemas (sub directories within basepath)"""
         subdirectories = self.ls("", only_directories=True)
         return [
-            self.ID_FMT["schema"].format(repo_name=self.name, schema_name=d) for d in subdirectories
+            self.ID_FMT["schema"].format(repo_id=self.id, schema_name=d) for d in subdirectories
         ]
 
     def _schema(self, schema_id: str) -> FSSchema:
         """List schemas (sub directories within basepath)"""
         parsed = self.parse_id(schema_id, "schema")
 
-        repo_name = parsed["repo_name"]
+        repo_id = parsed["repo_id"]
         schema_name = parsed["schema_name"]
         schema_path = self._basepath / schema_name
 
-        if repo_name != self.name:
+        if repo_id != self.id:
             raise FSRepositoryError("Trying to get schema that don't belong in this repository")
 
         tables = self.tables(schema_id)
-        return FSSchema(name=schema_name, id=schema_id, path=schema_path, tables=tables)
+        return FSSchema(
+            name=schema_name,
+            id=schema_id,
+            repo_id=self.id,
+            schema_id=schema_id,
+            path=schema_path,
+            tables=tables,
+        )
 
     def schema(self, schema_id: str) -> Schema:
         return self._schema(schema_id).ref
@@ -162,7 +155,7 @@ class FSRepository(Repository):
     def _tables(self, schema_id: str) -> list[str]:
         parsed = self.parse_id(schema_id, "schema")
         tables = self.ls(parsed["schema_name"])
-        return [self.ID_FMT["table"].format(table_name=t, **parsed) for t in tables]
+        return [self.ID_FMT["table"].format(table_name=t, schema_id=schema_id) for t in tables]
 
     def tables(self, schema_id: str = "") -> list[str]:
         if schema_id:
@@ -176,14 +169,12 @@ class FSRepository(Repository):
     def _table(self, table_id: str) -> FSTable:
         """Get infos on the table"""
         parsed = self.parse_id(table_id, "table")
-        if parsed["repo_name"] != self.name:
-            raise FSRepositoryError("Trying to get table that don't belong in this repository")
+        schema = self._schema(parsed["schema_id"])
 
-        schema_path = self._basepath / parsed["schema_name"]
-        if not schema_path.exists():
-            raise FSRepositoryError(f"The schema {parsed['schema_name']} does not exists.")
+        if not schema.path.exists():
+            raise FSRepositoryError(f"The schema {schema.id} does not exists.")
 
-        table_subpath = f"{parsed['schema_name']}/{parsed['table_name']}"
+        table_subpath = f"{schema.name}/{parsed['table_name']}"
         table_path = self._basepath / table_subpath
 
         is_partitionned = table_path.is_dir()
@@ -195,6 +186,8 @@ class FSRepository(Repository):
         return FSTable(
             name=parsed["table_name"],
             id=table_id,
+            repo_id=self.id,
+            schema_id=schema.id,
             path=table_path,
             is_partitionned=is_partitionned,
             partitions=partitions,
diff --git a/tests/storage/test_fs_repository.py b/tests/storage/test_fs_repository.py
index a40aa95..434e30e 100644
--- a/tests/storage/test_fs_repository.py
+++ b/tests/storage/test_fs_repository.py
@@ -45,50 +45,50 @@ def test_init(location):
 
 @pytest.fixture
 def repository(location) -> FSRepository:
-    return FSRepository("example", "example", location)
+    return FSRepository("repo_id", "example", location)
 
 
 def test_list_schemas(repository):
-    assert repository.schemas() == ["example-schema"]
+    assert repository.schemas() == ["repo_id-schema"]
 
 
 def test_describe_schema(location, repository):
-    schema = repository.schema("example-schema")
+    schema = repository.schema("repo_id-schema")
     assert schema.name == "schema"
-    assert schema.id == "example-schema"
-    assert schema.repo_id == str(location)
+    assert schema.id == "repo_id-schema"
+    assert schema.repo_id == "repo_id"
     assert schema.value == str(location / "schema")
     assert schema.tables == [
-        "example-schema-username",
-        "example-schema-recovery",
-        "example-schema-salary",
+        "repo_id-schema-username",
+        "repo_id-schema-recovery",
+        "repo_id-schema-salary",
     ]
 
 
 def test_list_tables_schema(repository):
-    assert repository.schema("example-schema").tables == [
-        "example-schema-username",
-        "example-schema-recovery",
-        "example-schema-salary",
+    assert repository.schema("repo_id-schema").tables == [
+        "repo_id-schema-username",
+        "repo_id-schema-recovery",
+        "repo_id-schema-salary",
     ]
-    assert repository.tables("example-schema") == [
-        "example-schema-username",
-        "example-schema-recovery",
-        "example-schema-salary",
+    assert repository.tables("repo_id-schema") == [
+        "repo_id-schema-username",
+        "repo_id-schema-recovery",
+        "repo_id-schema-salary",
     ]
     assert repository.tables() == [
-        "example-schema-username",
-        "example-schema-recovery",
-        "example-schema-salary",
+        "repo_id-schema-username",
+        "repo_id-schema-recovery",
+        "repo_id-schema-salary",
     ]
 
 
 def test_describe_table(location, repository):
-    table = repository.table("example-schema-username")
+    table = repository.table("repo_id-schema-username")
 
-    assert table.id == "example-schema-username"
-    assert table.repo_id == str(location)
-    assert table.schema_id == str(location / "schema")
+    assert table.id == "repo_id-schema-username"
+    assert table.repo_id == "repo_id"
+    assert table.schema_id == "repo_id-schema"
     assert table.name == "username"
     assert table.value == str(location / "schema" / "username")
     assert table.partitions == ["username.csv"]
@@ -96,11 +96,11 @@ def test_describe_table(location, repository):
 
 
 def test_describe_table_with_partitions(location, repository):
-    table = repository.table("example-schema-recovery")
+    table = repository.table("repo_id-schema-recovery")
 
-    assert table.id == "example-schema-recovery"
-    assert table.repo_id == str(location)
-    assert table.schema_id == str(location / "schema")
+    assert table.id == "repo_id-schema-recovery"
+    assert table.repo_id == "repo_id"
+    assert table.schema_id == "repo_id-schema"
     assert table.name == "recovery"
     assert table.value == str(location / "schema" / "recovery")
     assert table.partitions == [