Compare commits

5 Commits

15 changed files with 352 additions and 73 deletions

View File

View File

@@ -0,0 +1,70 @@
from collections.abc import Callable
from datetime import datetime
import pandas as pd
from pydantic import BaseModel
from ..repository.repository import AbstractRepository
class Schema(BaseModel):
    """Address of a schema: a repository name plus a schema name inside it."""

    # key into a repositories mapping (see the `Repositories` alias)
    repository: str
    # NOTE(review): `schema` shadows pydantic's BaseModel.schema() helper
    # (deprecated in pydantic v2) -- confirm no caller relies on it.
    schema: str
class Table(BaseModel):
    """Address of a table: repository name, schema name and table name."""

    # key into a repositories mapping (see the `Repositories` alias)
    repository: str
    # NOTE(review): `schema` shadows pydantic's BaseModel.schema() helper
    # (deprecated in pydantic v2) -- confirm no caller relies on it.
    schema: str
    table: str
class Flux(BaseModel):
    """A data flow: read sources, transform them, write named destinations."""

    # tables read, in order, before calling `transformation`
    sources: list[Table]
    # destination tables keyed by the names the transformation emits
    destinations: dict[str, Table]
    # maps the list of source frames (in `sources` order) to a
    # name -> frame mapping; each name must be a key of `destinations`
    transformation: Callable[[list[pd.DataFrame]], dict[str, pd.DataFrame]]
class State(BaseModel):
    """Execution report of one flux run."""

    # per-destination write status, as returned by the repository's write()
    statuses: dict[str, dict]
    # NOTE(review): qty_out and failed_lines are always 0 / [] in
    # consume_flux -- placeholders to be filled in later?
    qty_out: int
    failed_lines: list[str]
    # wall-clock bounds of the run (naive local datetimes)
    start: datetime
    end: datetime
Repositories = dict[str, AbstractRepository]
def open_source(repositories: Repositories, source: Table) -> pd.DataFrame:
    """Read the table addressed by *source* from its repository."""
    repo = repositories[source.repository]
    return repo.read(source.table, source.schema)
def write_source(
    content: pd.DataFrame, repositories: Repositories, destination: Table
) -> dict:
    """Write *content* into *destination* and return the write status.

    BUG FIX: the return annotation said ``str``, but ``repository.write``
    returns a status mapping -- ``State.statuses`` is ``dict[str, dict]``
    and the flux tests compare each status to a dict.
    """
    repository = repositories[destination.repository]
    return repository.write(content, destination.table, destination.schema)
def consume_flux(flux: Flux, repositories: dict[str, AbstractRepository]) -> State:
    """Run one flux end to end and report its execution state.

    Reads every source, applies the transformation, writes each produced
    frame to the destination registered under the same name, and returns
    a State with the per-destination write statuses and timing bounds.
    """
    start = datetime.now()

    frames = [open_source(repositories, src) for src in flux.sources]
    produced = flux.transformation(frames)

    # each key produced by the transformation must exist in destinations
    statuses = {}
    for name, frame in produced.items():
        statuses[name] = write_source(frame, repositories, flux.destinations[name])

    end = datetime.now()
    # qty_out / failed_lines are not computed yet -- reported as empty
    return State(
        statuses=statuses,
        qty_out=0,
        failed_lines=[],
        start=start,
        end=end,
    )

View File

@@ -1,35 +0,0 @@
from .schema import AbstractSchema
from pathlib import Path
class FSSchema(AbstractSchema):
    """Filesystem-backed schema: a schema is a directory, tables are files."""

    def __init__(self, basepath, metadata_engine=None):
        # root directory of the schema (kept as given, not resolved)
        self.basepath = basepath
        self._metadata_engine = metadata_engine

    def ls(self, dir, only_files=True):
        """List entries of *dir*; with only_files=True keep only files.

        BUG FIX: the only_files filter previously kept *directories*
        (``f.is_dir()``) instead of files.
        """
        dirpath = Path(dir)
        entries = list(dirpath.iterdir())
        if only_files:
            return [f for f in entries if not f.is_dir()]
        return entries

    def tables(self, dir, only_files=True):
        """List tables (files) of *dir* -- same listing rules as ls()."""
        # delegate instead of duplicating the (previously buggy) filter
        return self.ls(dir, only_files)

    def info(self, path):
        """Get infos about a file (not implemented yet)."""
        path = Path(path)
        pass

    def read(self, path):
        """Get content of a file (not implemented yet)."""
        path = Path(path)
        pass

    def write(self, path, content):
        """Write content into the file (not implemented yet)."""
        path = Path(path)
        pass

    def delete(self, path):
        """Delete the file (not implemented yet)."""
        path = Path(path)
        pass

View File

@@ -4,61 +4,83 @@ import pandas as pd
from .repository import AbstractRepository
# file extensions each pandas reader accepts
ACCEPTABLE_EXTENTIONS = {
    "csv": [".csv"],
    "excel": [".xls", ".xlsx"],
}


class FSRepository(AbstractRepository):
    """Repository backed by the filesystem: schemas are directories under
    ``basepath`` and tables are files inside them."""

    def __init__(self, name, basepath, metadata_engine=None):
        self.name = name
        self.basepath = Path(basepath)
        assert self.basepath.exists()
        self._metadata_engine = metadata_engine

    def ls(
        self, dir="", only_files=False, only_directories=False, recursive=False
    ) -> list[str]:
        """List entries of ``basepath/dir`` as strings relative to it.

        NOTE(review): the ``startswith(".")`` guards test the *full* path
        string (and, in the recursive branch, the str() of a walk() tuple),
        so hidden entries are effectively never filtered -- confirm intent.
        """
        dirpath = self.basepath / dir
        if only_files:
            return [
                str(f.relative_to(dirpath))
                for f in dirpath.iterdir()
                if not f.is_dir() and not str(f).startswith(".")
            ]
        if only_directories:
            if recursive:
                # Path.walk() yields (dirpath, dirnames, filenames);
                # the root itself comes first and shows up as ".".
                return [
                    str(f[0].relative_to(dirpath))
                    for f in dirpath.walk()
                    if not str(f).startswith(".")
                ]
            return [
                str(f.relative_to(dirpath))
                for f in dirpath.iterdir()
                if f.is_dir() and not str(f).startswith(".")
            ]
        return [
            str(f.relative_to(dirpath))
            for f in dirpath.iterdir()
            if not str(f).startswith(".")
        ]

    def schemas(self, recursive=True) -> list[str]:
        """List schemas (directories, recursively; root included as ".")."""
        # NOTE(review): the `recursive` parameter is ignored -- ls() is
        # always called with recursive=True.
        return self.ls("", only_directories=True, recursive=True)

    def tables(self, schema: str = ".") -> list[str]:
        """List tables (files) of a schema."""
        return self.ls(schema, only_files=True)

    def build_table_path(self, table: str, schema: str):
        """Absolute path of a table; schema "." means directly in basepath."""
        table_path = self.basepath
        if schema == ".":
            return table_path / table
        return table_path / schema / table

    def infos(self, table: str, schema: str = "."):
        """Get infos about the table (not implemented yet)."""
        table_path = self.build_table_path(table, schema)
        pass

    def read(self, table: str, schema: str = ".", **read_options):
        """Load a table into a DataFrame; *read_options* go to pandas.

        Raises ValueError for extensions outside ACCEPTABLE_EXTENTIONS.

        BUG FIX: the engine was hard-coded to "openpyxl" for every excel
        extension, but openpyxl cannot read ``.xls`` (xlrd is required
        for that, and is pinned in requirements); let pandas pick the
        engine from the extension instead.
        """
        table_path = self.build_table_path(table, schema)
        assert table_path.exists()
        extension = table_path.suffix
        if extension in ACCEPTABLE_EXTENTIONS["csv"]:
            return pd.read_csv(table_path, **read_options)
        if extension in ACCEPTABLE_EXTENTIONS["excel"]:
            return pd.read_excel(table_path, **read_options)
        raise ValueError("Bad extention. Can't open the table.")

    def write(self, content, table: str, schema: str = "."):
        """Write content into the table (not implemented yet)."""
        table_path = self.build_table_path(table, schema)
        pass

    def delete_table(self, table: str, schema: str = "."):
        """Delete the table (not implemented yet)."""
        table_path = self.build_table_path(table, schema)
        pass

View File

@@ -1,4 +1,5 @@
import abc
from .metadata import AbstractMetadataEngine
class AbstractRepository(abc.ABC):
    """Interface of a table store: schemas contain tables with content."""

    # metadata engine *class* (not an instance) used by concrete repositories
    metadata_engine = AbstractMetadataEngine

    @abc.abstractmethod
    def schemas(self) -> list[str]:
        """List schemas"""
        raise NotImplementedError

    @abc.abstractmethod
    def tables(self, schema) -> list[str]:
        """List table in schema"""
        raise NotImplementedError

    @abc.abstractmethod
    def infos(self, table: str, schema: str) -> dict[str, str]:
        """Get infos about the table"""
        raise NotImplementedError

    @abc.abstractmethod
    def read(self, table: str, schema: str):
        """Get content of the table"""
        raise NotImplementedError

    @abc.abstractmethod
    def write(self, content, table: str, schema: str):
        """Write content into the table"""
        raise NotImplementedError

    @abc.abstractmethod
    def delete_table(self, table: str, schema: str):
        """Delete the table"""
        raise NotImplementedError

View File

@@ -1,7 +1,6 @@
jupyter==1.0.0
pandas==1.5.0
pdf-oralia==0.3.11
pydantic==2.6.1
pandas==2.2.2
pydantic==2.8.2
click==8.1.7
dlt[duckdb]>=0.4.3a0
openpyxl>=3.0.0
openpyxl==3.1.5
xlrd==2.0.1

0
tests/__init__.py Normal file
View File

View File

Binary file not shown.

View File

@@ -0,0 +1,7 @@
Username;Identifier;First name;Last name
booker12;9012;Rachel;Booker
grey07;2070;Laura;Grey
johnson81;4081;Craig;Johnson
jenkins46;9346;Mary;Jenkins
smith79;5079;Jamie;Smith
1 Username Identifier First name Last name
2 booker12 9012 Rachel Booker
3 grey07 2070 Laura Grey
4 johnson81 4081 Craig Johnson
5 jenkins46 9346 Mary Jenkins
6 smith79 5079 Jamie Smith

View File

@@ -0,0 +1,84 @@
import shutil
from pathlib import Path

import pandas  # BUG FIX: `from pandas import pandas` raises ImportError
import pytest

from dashboard.libs.repository.fs_repository import FSRepository

# directory holding the example files copied into the tmp repository
EXAMPLE_DIR = "./tests/repository/fs_examples/"
@pytest.fixture
def location(tmp_path):
    """Build a repository layout under tmp_path from the EXAMPLE_DIR files.

    Creates username/ and salary/ directories; example files whose name
    contains "username" go to username/, everything else to salary/.
    """
    username_dir = tmp_path / "username"
    salary_dir = tmp_path / "salary"
    username_dir.mkdir()
    salary_dir.mkdir()

    for example in Path(EXAMPLE_DIR).glob("*"):
        target = username_dir if "username" in str(example) else salary_dir
        shutil.copy(example, target)

    return tmp_path
def test_init(location):
    """FSRepository should see the fixture layout.

    BUG FIX: the assertions relied on filesystem iteration order, which
    ``iterdir``/``walk`` do not guarantee -- compare sorted listings so
    the test is deterministic across platforms.
    """
    repo = FSRepository("example", location)
    assert sorted(repo.ls()) == ["salary", "username"]
    assert sorted(repo.schemas()) == [".", "salary", "username"]
    # no tables live directly at the repository root
    assert repo.tables() == []
    assert sorted(repo.tables("username")) == [
        "username-password-recovery-code.xls",
        "username-password-recovery-code.xlsx",
        "username.csv",
    ]
    assert repo.tables("salary") == ["salary.pdf"]
def test_read_csv(location):
    """Reading a ;-separated csv table forwards options to pandas."""
    repo = FSRepository("example", location)
    frame = repo.read("username.csv", "username", delimiter=";")
    expected_columns = ["Username", "Identifier", "First name", "Last name"]
    assert list(frame.columns) == expected_columns
    assert len(frame.index) == 5
def test_fake_read_xlsx(location):
    """Smoke test: pandas itself can open the .xls fixture directly."""
    repo = FSRepository("example", location)
    xls_path = location / "username" / "username-password-recovery-code.xls"
    df = pandas.read_excel(xls_path)
    print(df)
def test_read_xlsx(location):
    """Reading an .xls table through the repository yields all columns."""
    repo = FSRepository("example", location)
    frame = repo.read("username-password-recovery-code.xls", "username")
    expected_columns = [
        "Username",
        "Identifier",
        "One-time password",
        "Recovery code",
        "First name",
        "Last name",
        "Department",
        "Location",
    ]
    assert list(frame.columns) == expected_columns
    assert len(frame.index) == 5

131
tests/test_flux.py Normal file
View File

@@ -0,0 +1,131 @@
import pandas as pd
import pytest
from dashboard.libs.flux.flux import Flux, consume_flux
from dashboard.libs.repository.repository import AbstractRepository
FakeTable = pd.DataFrame
FakeSchema = dict[str, pd.DataFrame]
FakeSchemas = dict[str, FakeSchema]
class FakeRepository(AbstractRepository):
    """In-memory repository double that counts reads and writes per table."""

    @staticmethod
    def _new_entry(df):
        """Wrap a dataframe with fresh bookkeeping metadata."""
        return {
            "df": df,
            "metadata": {"status": "new", "qty_read": 0, "qty_write": 0},
        }

    def __init__(self, schemas: FakeSchemas):
        self._schemas = {
            schema_name: {
                table: self._new_entry(df) for table, df in tables.items()
            }
            for schema_name, tables in schemas.items()
        }

    def schemas(self):
        """List schemas"""
        return list(self._schemas)

    def tables(self, schema):
        """List table's name in schema"""
        return list(self._schemas[schema])

    def infos(self, table: str, schema: str) -> dict[str, str]:
        """Get infos about the table"""
        return self._schemas[schema][table]["metadata"]

    def read(self, table, schema) -> pd.DataFrame:
        """Get content of the table"""
        entry = self._schemas[schema][table]
        entry["metadata"]["qty_read"] += 1
        return entry["df"]

    def write(self, content, table, schema) -> dict[str, str]:
        """Write content into the table"""
        tables = self._schemas[schema]
        if table in tables:
            tables[table]["df"] = content
        else:
            tables[table] = self._new_entry(content)
        meta = tables[table]["metadata"]
        meta["status"] = "modified"
        meta["qty_write"] += 1
        return self.infos(table, schema)

    def delete_table(self, table, schema):
        """Delete the table"""
        raise NotImplementedError
def test_fakerepository():
    """FakeRepository tracks read/write counts and write status."""
    repo = FakeRepository(
        {
            "foo": {
                "table1": pd.DataFrame({"A": []}),
                "table2": pd.DataFrame({"B": []}),
            },
            "bar": {
                "table1": pd.DataFrame({"C": []}),
                "table2": pd.DataFrame({"D": []}),
            },
        }
    )
    assert repo.schemas() == ["foo", "bar"]
    assert repo.tables("foo") == ["table1", "table2"]

    fresh_meta = {"status": "new", "qty_read": 0, "qty_write": 0}
    assert repo.infos("table1", "foo") == fresh_meta

    # reading returns the stored frame and bumps qty_read
    assert repo.read("table1", "foo").equals(pd.DataFrame({"A": []}))
    assert repo.infos("table1", "foo") == {
        "status": "new",
        "qty_read": 1,
        "qty_write": 0,
    }

    # writing marks the table modified and bumps qty_write
    replacement = pd.DataFrame({"A": [1, 2]})
    assert repo.write(replacement, "table1", "foo") == {
        "status": "modified",
        "qty_read": 1,
        "qty_write": 1,
    }
def test_consume_flux():
    """End-to-end: read source, double it, write to the destination repo."""
    source_repository = FakeRepository(
        {
            "source": {
                "table1": pd.DataFrame({"A": [1, 2, 3]}),
            },
        }
    )
    dest_repository = FakeRepository(
        {
            "destination": {},
        }
    )
    repositories = {
        "source": source_repository,
        "dest": dest_repository,
    }

    def double(dfs):
        """Emit the first source frame multiplied by two under 'dest'."""
        return {"dest": dfs[0] * 2}

    flux = Flux(
        sources=[{"repository": "source", "schema": "source", "table": "table1"}],
        destinations={
            "dest": {"repository": "dest", "schema": "destination", "table": "table1"}
        },
        transformation=double,
    )

    state = consume_flux(flux, repositories)

    assert state.statuses["dest"] == {
        "status": "modified",
        "qty_read": 0,
        "qty_write": 1,
    }
    written = dest_repository.read("table1", "destination")
    assert written.equals(pd.DataFrame({"A": [2, 4, 6]}))