Feat: create datacatalogue with fs_datacatalogue
This commit is contained in:
parent
aa1ead5435
commit
88795fdad3
0
plesna/datastore/__init__.py
Normal file
0
plesna/datastore/__init__.py
Normal file
1
plesna/datastore/catalogue.py
Normal file
1
plesna/datastore/catalogue.py
Normal file
@ -0,0 +1 @@
|
|||||||
|
class
|
21
plesna/datastore/datacatalogue.py
Normal file
21
plesna/datastore/datacatalogue.py
Normal file
@ -0,0 +1,21 @@
|
|||||||
|
import abc
|
||||||
|
|
||||||
|
|
||||||
|
class DataCatalogue:
    """Interface of a data catalogue: schemas, their tables and table infos.

    NOTE: the methods below are decorated with ``abc.abstractmethod`` but the
    class does not derive from ``abc.ABC``, so Python does not enforce them at
    instantiation time; an unimplemented method only fails when it is called,
    through ``NotImplementedError``. Switching to ``abc.ABC`` would make every
    subclass that lacks one of these methods impossible to instantiate.
    """

    def __init__(self):
        pass

    @abc.abstractmethod
    def schemas(self) -> dict[str, str]:
        """List schemas"""
        raise NotImplementedError

    @abc.abstractmethod
    def tables(self, schema) -> dict[str, str]:
        """List tables in schema"""
        raise NotImplementedError

    @abc.abstractmethod
    def infos(self, table: str, schema: str) -> dict[str, str]:
        """Get infos about the table"""
        raise NotImplementedError
|
3
plesna/datastore/datastore.py
Normal file
3
plesna/datastore/datastore.py
Normal file
@ -0,0 +1,3 @@
|
|||||||
|
class DataStore:
    """Named data store."""

    def __init__(self, name):
        """Keep the name of the store.

        Args:
            name: Name given to the store.
        """
        # The original only evaluated ``self._name`` without assigning it,
        # which raised AttributeError on every instantiation.
        self._name = name
|
86
plesna/datastore/fs_datacatalogue.py
Normal file
86
plesna/datastore/fs_datacatalogue.py
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from pydantic import BaseModel, computed_field
|
||||||
|
|
||||||
|
from .datacatalogue import DataCatalogue
|
||||||
|
|
||||||
|
|
||||||
|
class Schema(BaseModel):
    """Schema entry of a catalogue, described by its path."""

    # ``id`` and ``value`` below are both the string form of this path.
    path: Path

    @computed_field
    @property
    def id(self) -> str:
        """Identifier of the schema: its path as a string."""
        return f"{self.path}"

    @computed_field
    @property
    def value(self) -> str:
        """Displayed value of the schema: its path as a string."""
        return f"{self.path}"
|
||||||
|
|
||||||
|
|
||||||
|
class Table(BaseModel):
    """Table entry of a catalogue, described by its path."""

    # ``id`` and ``value`` below are both the string form of this path.
    path: Path

    @computed_field
    @property
    def id(self) -> str:
        """Identifier of the table: its path as a string."""
        return f"{self.path}"

    @computed_field
    @property
    def value(self) -> str:
        """Displayed value of the table: its path as a string."""
        return f"{self.path}"
|
||||||
|
|
||||||
|
|
||||||
|
class FSDataCatalogue(DataCatalogue):
    """DataCatalogue based on files tree structure.

    Schemas are the directories found under the base path (the base path
    itself being the schema ``"."``) and tables are the files located directly
    inside a schema directory.
    """

    def __init__(self, name: str, basepath: str = "."):
        """Bind the catalogue to an existing directory.

        Args:
            name: Name given to the catalogue.
            basepath: Root directory of the catalogue.

        Raises:
            AssertionError: If ``basepath`` does not exist.
        """
        super().__init__()
        self._basepath = Path(basepath)
        self.name = name

        # NOTE: asserts are stripped under ``python -O``; kept as an assert so
        # the exception type seen by existing callers does not change.
        assert self._basepath.exists(), f"{self._basepath} does not exist"

    @staticmethod
    def _visible_directories(top: Path):
        """Yield ``top`` and every non-hidden directory below it, top-down."""
        for current, dirnames, _ in top.walk():
            # Pruning dirnames in place keeps walk() out of hidden trees.
            dirnames[:] = [d for d in dirnames if not d.startswith(".")]
            yield current

    def ls(
        self, dir="", only_files=False, only_directories=False, recursive=False
    ) -> list[str]:
        """List the content of a directory of the catalogue.

        Entries whose name starts with a dot are skipped. The test is made on
        the entry name, not on the full path, so the result does not depend on
        how the base path is spelled (absolute, ``../x``...).

        Args:
            dir: Directory to list, relative to the base path.
            only_files: Keep only entries that are not directories. Takes
                precedence over ``only_directories``.
            only_directories: Keep only directories.
            recursive: Only used together with ``only_directories``: walk the
                whole tree, in which case the listed directory itself is
                reported as ``"."``.

        Returns:
            Paths relative to the listed directory, as strings.
        """
        dirpath = self._basepath / dir

        if only_directories and recursive and not only_files:
            return [
                str(found.relative_to(dirpath))
                for found in self._visible_directories(dirpath)
            ]

        entries = [
            entry for entry in dirpath.iterdir() if not entry.name.startswith(".")
        ]
        if only_files:
            entries = [entry for entry in entries if not entry.is_dir()]
        elif only_directories:
            entries = [entry for entry in entries if entry.is_dir()]

        return [str(entry.relative_to(dirpath)) for entry in entries]

    def schemas(self) -> dict[str, Schema]:
        """List schemas (sub directories within basepath)"""
        subdirectories = self.ls("", only_directories=True, recursive=True)
        return {path: Schema(path=path) for path in subdirectories}

    def tables(self, schema_id=".") -> dict[str, Table]:
        """List tables in schema (which are files in the directory)"""
        tables = (Table(path=path) for path in self.ls(schema_id, only_files=True))
        return {table.id: table for table in tables}
|
0
tests/datastore/__init__.py
Normal file
0
tests/datastore/__init__.py
Normal file
BIN
tests/datastore/fs_files/salary.pdf
Normal file
BIN
tests/datastore/fs_files/salary.pdf
Normal file
Binary file not shown.
BIN
tests/datastore/fs_files/username-password-recovery-code.xls
Normal file
BIN
tests/datastore/fs_files/username-password-recovery-code.xls
Normal file
Binary file not shown.
BIN
tests/datastore/fs_files/username-password-recovery-code.xlsx
Normal file
BIN
tests/datastore/fs_files/username-password-recovery-code.xlsx
Normal file
Binary file not shown.
7
tests/datastore/fs_files/username.csv
Normal file
7
tests/datastore/fs_files/username.csv
Normal file
@ -0,0 +1,7 @@
|
|||||||
|
Username;Identifier;First name;Last name
|
||||||
|
booker12;9012;Rachel;Booker
|
||||||
|
grey07;2070;Laura;Grey
|
||||||
|
johnson81;4081;Craig;Johnson
|
||||||
|
jenkins46;9346;Mary;Jenkins
|
||||||
|
smith79;5079;Jamie;Smith
|
||||||
|
|
|
72
tests/datastore/test_fs_datacatalogue.py
Normal file
72
tests/datastore/test_fs_datacatalogue.py
Normal file
@ -0,0 +1,72 @@
|
|||||||
|
import shutil
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from plesna.datastore.fs_datacatalogue import FSDataCatalogue
|
||||||
|
|
||||||
|
# Directory holding the example files copied into each test location.
FIXTURE_DIR = Path(__file__).parent / "fs_files"
|
||||||
|
|
||||||
|
|
||||||
|
@pytest.fixture
def location(tmp_path):
    """Build a temporary catalogue tree with a ``username`` and a ``salary`` directory.

    Every file of FIXTURE_DIR whose name contains "username" is copied into
    ``username``; the other ones are copied into ``salary``.

    Returns:
        The root of the temporary tree.
    """
    loc = tmp_path
    username_loc = loc / "username"
    username_loc.mkdir()
    salary_loc = loc / "salary"
    salary_loc.mkdir()
    example_src = FIXTURE_DIR
    assert example_src.exists()

    for f in example_src.glob("*"):
        # Match on the file name only: matching on the whole path would send
        # every file to ``username`` when a parent directory of the checkout
        # contains "username".
        if "username" in f.name:
            shutil.copy(f, username_loc)
        else:
            shutil.copy(f, salary_loc)

    return loc
|
||||||
|
|
||||||
|
|
||||||
|
def test_init(location):
    """The catalogue lists the two directories created by the fixture."""
    repo = FSDataCatalogue("example", location)
    expected = ["salary", "username"]

    # Directory iteration order is arbitrary, so compare sorted listings
    # instead of relying on the order the filesystem happens to return.
    assert sorted(repo.ls()) == expected

    assert sorted(repo.ls(recursive=True)) == expected
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_schema(location):
    """Schemas expose the expected ``id``, ``value`` and ``path`` once dumped."""
    repo = FSDataCatalogue("example", location)

    def dumped(field):
        # Map each schema key to one field of its dumped model.
        return {
            key: schema.model_dump()[field]
            for key, schema in repo.schemas().items()
        }

    names = {
        ".": ".",
        "username": "username",
        "salary": "salary",
    }
    assert dumped("id") == names
    assert dumped("value") == names
    assert dumped("path") == {
        ".": Path("."),
        "username": Path("username"),
        "salary": Path("salary"),
    }
|
||||||
|
|
||||||
|
|
||||||
|
def test_list_tables(location):
    """Tables are the files of a schema; the root schema holds none."""
    repo = FSDataCatalogue("example", location)

    def dumped_values(schema_id):
        # Map each table key of the schema to its dumped ``value`` field.
        return {
            key: table.model_dump()["value"]
            for key, table in repo.tables(schema_id).items()
        }

    assert repo.tables() == {}
    assert dumped_values("username") == {
        "username.csv": "username.csv",
        "username-password-recovery-code.xlsx": "username-password-recovery-code.xlsx",
        "username-password-recovery-code.xls": "username-password-recovery-code.xls",
    }
    assert dumped_values("salary") == {
        "salary.pdf": "salary.pdf",
    }
|
Loading…
Reference in New Issue
Block a user