refact: repo id are not based on path but on id

This commit is contained in:
Bertrand Benjamin 2025-01-05 15:13:38 +01:00
parent ae61fd3c12
commit 8a43a93cda
2 changed files with 53 additions and 60 deletions

View File

@ -7,25 +7,10 @@ from plesna.models.storage import Partition, Schema, Table
from plesna.storage.repository.repository import Repository from plesna.storage.repository.repository import Repository
class FSPartition(BaseModel):
name: str
path: Path
@computed_field
@property
def ref(self) -> Partition:
return Partition(
id=str(self.path),
repo_id=str(self.path.parent.parent.parent),
schema_id=str(self.path.parent.parent),
table_id=str(self.path.parent),
name=self.name,
value=str(self.path.absolute()),
)
class FSTable(BaseModel): class FSTable(BaseModel):
name: str name: str
repo_id: str
schema_id: str
id: str id: str
path: Path path: Path
is_partitionned: bool is_partitionned: bool
@ -41,8 +26,8 @@ class FSTable(BaseModel):
return Table( return Table(
id=self.id, id=self.id,
repo_id=str(self.path.parent.parent), repo_id=self.repo_id,
schema_id=str(self.path.parent), schema_id=self.schema_id,
name=self.name, name=self.name,
value=str(self.path.absolute()), value=str(self.path.absolute()),
partitions=self.partitions, partitions=self.partitions,
@ -52,6 +37,7 @@ class FSTable(BaseModel):
class FSSchema(BaseModel): class FSSchema(BaseModel):
name: str name: str
repo_id: str
id: str id: str
path: Path path: Path
tables: list[str] tables: list[str]
@ -61,7 +47,7 @@ class FSSchema(BaseModel):
def ref(self) -> Schema: def ref(self) -> Schema:
return Schema( return Schema(
id=self.id, id=self.id,
repo_id=str(self.path.parent), repo_id=self.repo_id,
name=self.name, name=self.name,
value=str(self.path.absolute()), value=str(self.path.absolute()),
tables=self.tables, tables=self.tables,
@ -82,8 +68,8 @@ class FSRepository(Repository):
""" """
ID_FMT = { ID_FMT = {
"schema": "{repo_name}-{schema_name}", "schema": "{repo_id}-{schema_name}",
"table": "{repo_name}-{schema_name}-{table_name}", "table": "{schema_id}-{table_name}",
} }
def __init__(self, id: str, name: str, basepath: str): def __init__(self, id: str, name: str, basepath: str):
@ -139,22 +125,29 @@ class FSRepository(Repository):
"""List schemas (sub directories within basepath)""" """List schemas (sub directories within basepath)"""
subdirectories = self.ls("", only_directories=True) subdirectories = self.ls("", only_directories=True)
return [ return [
self.ID_FMT["schema"].format(repo_name=self.name, schema_name=d) for d in subdirectories self.ID_FMT["schema"].format(repo_id=self.id, schema_name=d) for d in subdirectories
] ]
def _schema(self, schema_id: str) -> FSSchema: def _schema(self, schema_id: str) -> FSSchema:
"""List schemas (sub directories within basepath)""" """List schemas (sub directories within basepath)"""
parsed = self.parse_id(schema_id, "schema") parsed = self.parse_id(schema_id, "schema")
repo_name = parsed["repo_name"] repo_id = parsed["repo_id"]
schema_name = parsed["schema_name"] schema_name = parsed["schema_name"]
schema_path = self._basepath / schema_name schema_path = self._basepath / schema_name
if repo_name != self.name: if repo_id != self.id:
raise FSRepositoryError("Trying to get schema that don't belong in this repository") raise FSRepositoryError("Trying to get schema that don't belong in this repository")
tables = self.tables(schema_id) tables = self.tables(schema_id)
return FSSchema(name=schema_name, id=schema_id, path=schema_path, tables=tables) return FSSchema(
name=schema_name,
id=schema_id,
repo_id=self.id,
schema_id=schema_id,
path=schema_path,
tables=tables,
)
def schema(self, schema_id: str) -> Schema: def schema(self, schema_id: str) -> Schema:
return self._schema(schema_id).ref return self._schema(schema_id).ref
@ -162,7 +155,7 @@ class FSRepository(Repository):
def _tables(self, schema_id: str) -> list[str]: def _tables(self, schema_id: str) -> list[str]:
parsed = self.parse_id(schema_id, "schema") parsed = self.parse_id(schema_id, "schema")
tables = self.ls(parsed["schema_name"]) tables = self.ls(parsed["schema_name"])
return [self.ID_FMT["table"].format(table_name=t, **parsed) for t in tables] return [self.ID_FMT["table"].format(table_name=t, schema_id=schema_id) for t in tables]
def tables(self, schema_id: str = "") -> list[str]: def tables(self, schema_id: str = "") -> list[str]:
if schema_id: if schema_id:
@ -176,14 +169,12 @@ class FSRepository(Repository):
def _table(self, table_id: str) -> FSTable: def _table(self, table_id: str) -> FSTable:
"""Get infos on the table""" """Get infos on the table"""
parsed = self.parse_id(table_id, "table") parsed = self.parse_id(table_id, "table")
if parsed["repo_name"] != self.name: schema = self._schema(parsed["schema_id"])
raise FSRepositoryError("Trying to get table that don't belong in this repository")
schema_path = self._basepath / parsed["schema_name"] if not schema.path.exists():
if not schema_path.exists(): raise FSRepositoryError(f"The schema {schema.id} does not exists.")
raise FSRepositoryError(f"The schema {parsed['schema_name']} does not exists.")
table_subpath = f"{parsed['schema_name']}/{parsed['table_name']}" table_subpath = f"{schema.name}/{parsed['table_name']}"
table_path = self._basepath / table_subpath table_path = self._basepath / table_subpath
is_partitionned = table_path.is_dir() is_partitionned = table_path.is_dir()
@ -195,6 +186,8 @@ class FSRepository(Repository):
return FSTable( return FSTable(
name=parsed["table_name"], name=parsed["table_name"],
id=table_id, id=table_id,
repo_id=self.id,
schema_id=schema.id,
path=table_path, path=table_path,
is_partitionned=is_partitionned, is_partitionned=is_partitionned,
partitions=partitions, partitions=partitions,

View File

@ -45,50 +45,50 @@ def test_init(location):
@pytest.fixture @pytest.fixture
def repository(location) -> FSRepository: def repository(location) -> FSRepository:
return FSRepository("example", "example", location) return FSRepository("repo_id", "example", location)
def test_list_schemas(repository): def test_list_schemas(repository):
assert repository.schemas() == ["example-schema"] assert repository.schemas() == ["repo_id-schema"]
def test_describe_schema(location, repository): def test_describe_schema(location, repository):
schema = repository.schema("example-schema") schema = repository.schema("repo_id-schema")
assert schema.name == "schema" assert schema.name == "schema"
assert schema.id == "example-schema" assert schema.id == "repo_id-schema"
assert schema.repo_id == str(location) assert schema.repo_id == "repo_id"
assert schema.value == str(location / "schema") assert schema.value == str(location / "schema")
assert schema.tables == [ assert schema.tables == [
"example-schema-username", "repo_id-schema-username",
"example-schema-recovery", "repo_id-schema-recovery",
"example-schema-salary", "repo_id-schema-salary",
] ]
def test_list_tables_schema(repository): def test_list_tables_schema(repository):
assert repository.schema("example-schema").tables == [ assert repository.schema("repo_id-schema").tables == [
"example-schema-username", "repo_id-schema-username",
"example-schema-recovery", "repo_id-schema-recovery",
"example-schema-salary", "repo_id-schema-salary",
] ]
assert repository.tables("example-schema") == [ assert repository.tables("repo_id-schema") == [
"example-schema-username", "repo_id-schema-username",
"example-schema-recovery", "repo_id-schema-recovery",
"example-schema-salary", "repo_id-schema-salary",
] ]
assert repository.tables() == [ assert repository.tables() == [
"example-schema-username", "repo_id-schema-username",
"example-schema-recovery", "repo_id-schema-recovery",
"example-schema-salary", "repo_id-schema-salary",
] ]
def test_describe_table(location, repository): def test_describe_table(location, repository):
table = repository.table("example-schema-username") table = repository.table("repo_id-schema-username")
assert table.id == "example-schema-username" assert table.id == "repo_id-schema-username"
assert table.repo_id == str(location) assert table.repo_id == "repo_id"
assert table.schema_id == str(location / "schema") assert table.schema_id == "repo_id-schema"
assert table.name == "username" assert table.name == "username"
assert table.value == str(location / "schema" / "username") assert table.value == str(location / "schema" / "username")
assert table.partitions == ["username.csv"] assert table.partitions == ["username.csv"]
@ -96,11 +96,11 @@ def test_describe_table(location, repository):
def test_describe_table_with_partitions(location, repository): def test_describe_table_with_partitions(location, repository):
table = repository.table("example-schema-recovery") table = repository.table("repo_id-schema-recovery")
assert table.id == "example-schema-recovery" assert table.id == "repo_id-schema-recovery"
assert table.repo_id == str(location) assert table.repo_id == "repo_id"
assert table.schema_id == str(location / "schema") assert table.schema_id == "repo_id-schema"
assert table.name == "recovery" assert table.name == "recovery"
assert table.value == str(location / "schema" / "recovery") assert table.value == str(location / "schema" / "recovery")
assert table.partitions == [ assert table.partitions == [