Compare commits

...

3 Commits

6 changed files with 113 additions and 97 deletions

View File

@@ -1,4 +1,6 @@
from plesna.compute.consume_flux import consume_flux
from plesna.graph.graph_set import GraphSet
from plesna.models.flux import Flux, FluxMetaData
from plesna.storage.repository.repository import Repository
@@ -10,14 +12,15 @@ class DataPlateform:
def __init__(self):
self._graphset = GraphSet()
self._metadata_engine = ""
self._transformations = {}
self._fluxes = {}
self._repositories = {}
def add_repository(self, name: str, repository: Repository):
def add_repository(self, name: str, repository: Repository) -> str:
if name in self._repositories:
raise DataPlateformError("The repository {name} already exists")
self._repositories[name] = repository
return name
@property
def repositories(self) -> list[str]:
@@ -25,3 +28,22 @@ class DataPlateform:
def repository(self, name: str) -> Repository:
return self._repositories[name]
def add_flux(self, name: str, flux: Flux) -> str:
if name in self._fluxes:
raise DataPlateformError("The flux {name} already exists")
self._fluxes[name] = flux
return name
@property
def fluxes(self) -> list[str]:
return list(self._fluxes)
def flux(self, name: str) -> Flux:
return self._fluxes[name]
def execute_flux(self, name: str) -> FluxMetaData:
if name not in self._fluxes:
raise DataPlateformError("The flux {name} is not registered")
return consume_flux(self._fluxes[name])
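
For orientation, a minimal usage sketch of the flux API added above. The repository fixture, table names and the foo transformation are taken from the tests at the end of this compare; treat it as a sketch, not part of the diff.

from plesna.dataplatform import DataPlateform
from plesna.models.flux import Flux
from plesna.models.transformation import Transformation

def foo(sources, targets):
    # transformation functions receive the source and target tables and return a plain dict
    return {"who": "foo"}

dp = DataPlateform()
dp.add_repository("test", repository)  # `repository` is assumed to be an FSRepository as in the test fixture
dp.add_flux(
    "foo",
    Flux(
        sources={"username": repository.table("raw", "username")},
        targets={"username": repository.table("bronze", "username")},
        transformation=Transformation(function=foo, extra_kwrds={}),
    ),
)

meta = dp.execute_flux("foo")  # FluxMetaData wrapping whatever the transformation returned
assert meta.data == {"who": "foo"}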

View File

@@ -25,6 +25,10 @@ class Table(BaseModel):
schema_id: id of the schema the table belongs to
name: the name of the table
value: string which describes where to find the table in the storage system
partitions: list of partitions
datas: list of strings used to access the data
"""
id: str
@@ -33,6 +37,7 @@
name: str
value: str
partitions: list[str] = []
datas: list[str]
class Partition(BaseModel):
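
To make the new required field concrete, a hedged sketch of constructing a Table directly; the field names come from the model above and from the tests in this compare, the values are invented for illustration.

from plesna.models.storage import Table

table = Table(
    id="raw.username",                       # illustrative values only
    repo_id="test",
    schema_id="raw",
    name="username",
    value="/tmp/test/raw/username",
    partitions=["username.csv"],             # still optional, defaults to []
    datas=["/tmp/test/raw/username/username.csv"],  # new required field: one entry per concrete data location
)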

View File

@@ -1,81 +0,0 @@
from pathlib import Path
from pydantic import BaseModel, computed_field
from plesna.models.storage import Schema, Table
from .datacatalogue import DataCatalogue
class FakeSchema(BaseModel):
name: str
@computed_field
@property
def ref(self) -> Schema:
return Schema(
id=str(self.name),
value=str(self.name),
)
class FakeTable(BaseModel):
name: str
data: dict[str, list]
@computed_field
@property
def ref(self) -> Table:
return Table(
id=str(self.name),
value=str(self.name),
)
class FakeDataCatalogue(DataCatalogue):
"""DataCatalogue based on dictionnaries"""
def __init__(self, name: str):
self.name = name
def ls(
self, dir="", only_files=False, only_directories=False, recursive=False
) -> list[str]:
dirpath = self._basepath / dir
if only_files:
return [
str(f.relative_to(dirpath))
for f in dirpath.iterdir()
if not f.is_dir() and not str(f).startswith(".")
]
if only_directories:
if recursive:
return [
str(f[0].relative_to(dirpath))
for f in dirpath.walk()
if not str(f).startswith(".")
]
return [
str(f.relative_to(dirpath))
for f in dirpath.iterdir()
if f.is_dir() and not str(f).startswith(".")
]
return [
str(f.relative_to(dirpath))
for f in dirpath.iterdir()
if not str(f).startswith(".")
]
def schemas(self) -> dict[str, FSSchema]:
"""List schemas (sub directories within basepath)"""
subdirectories = self.ls("", only_directories=True, recursive=True)
return {str(path): FSSchema(path=path) for path in subdirectories}
def tables(self, schema_id=".") -> dict[str, FSTable]:
"""List table in schema (which are files in the directory)"""
schema_path = schema_id
return {path: FSTable(path=path) for path in self.ls(schema_path, only_files=True)}

View File

@@ -32,6 +32,11 @@ class FSTable(BaseModel):
@computed_field
@property
def ref(self) -> Table:
if self.is_partitionned:
datas = [str(self.path.absolute() / p) for p in self.partitions]
else:
datas = [str(self.path.absolute())]
return Table(
id=str(self.path),
repo_id=str(self.path.parent.parent),
@@ -39,6 +44,7 @@ class FSTable(BaseModel):
name=self.name,
value=str(self.path.absolute()),
partitions=self.partitions,
datas=datas,
)
@@ -75,9 +81,7 @@ class FSRepository(Repository):
assert self._basepath.exists()
def ls(
self, dir="", only_files=False, only_directories=False, recursive=False
) -> list[str]:
def ls(self, dir="", only_files=False, only_directories=False, recursive=False) -> list[str]:
"""List files in dir
:param dir: relative path from self._basepath
@@ -106,9 +110,7 @@ class FSRepository(Repository):
if f.is_dir() and not str(f).startswith(".")
]
return [
str(f.relative_to(dirpath)) for f in paths if not str(f).startswith(".")
]
return [str(f.relative_to(dirpath)) for f in paths if not str(f).startswith(".")]
def schemas(self) -> list[str]:
"""List schemas (sub directories within basepath)"""

View File

@@ -7,18 +7,18 @@ from plesna.models.transformation import Transformation
def test_consume_flux():
sources = {
"src1": Table(
id="src1", repo_id="test", schema_id="test", name="test", value="here"
id="src1", repo_id="test", schema_id="test", name="test", value="here", datas=["d"]
),
"src2": Table(
id="src2", repo_id="test", schema_id="test", name="test", value="here"
id="src2", repo_id="test", schema_id="test", name="test", value="here", datas=["d"]
),
}
targets = {
"tgt1": Table(
id="tgt1", repo_id="test", schema_id="test", name="test", value="this"
id="tgt1", repo_id="test", schema_id="test", name="test", value="this", datas=["d"]
),
"tgt2": Table(
id="tgt2", repo_id="test", schema_id="test", name="test", value="that"
id="tgt2", repo_id="test", schema_id="test", name="test", value="that", datas=["d"]
),
}

View File

@@ -4,6 +4,8 @@ from pathlib import Path
import pytest
from plesna.dataplatform import DataPlateform
from plesna.models.flux import Flux
from plesna.models.transformation import Transformation
from plesna.storage.repository.fs_repository import FSRepository
FIXTURE_DIR = Path(__file__).parent.parent / Path("raw_datas")
@@ -27,7 +29,7 @@ def repository(tmp_path) -> FSRepository:
for f in example_src.glob("*"):
if "recovery" in str(f):
shutil.copy(f, recovery_loc)
if "salary" in str(f):
elif "salary" in str(f):
shutil.copy(f, salary_loc)
else:
shutil.copy(f, username_loc)
@@ -51,12 +53,70 @@ def test_add_repository(
assert dp.repository("test") == repository
@pytest.fixture
def foo_flux(repository: FSRepository) -> Flux:
src = {"username": repository.table("raw", "username")}
targets = {"username": repository.table("bronze", "username")}
def foo(sources, targets):
return {"who": "foo"}
extra_kwrds = {}
flux = Flux(
sources=src,
targets=targets,
transformation=Transformation(function=foo, extra_kwrds=extra_kwrds),
)
return flux
@pytest.fixture
def copy_flux(repository: FSRepository) -> Flux:
raw_username = {"username": repository.table("raw", "username")}
bronze_username = {"username": repository.table("bronze", "username")}
def copy(sources, targets):
src_path = Path(sources["username"].datas[0])
tgt_path = Path(targets["username"].datas[0])
shutil.copy(src_path, tgt_path)
return {"src_size": src_path.stat().st_size, "tgt_size": tgt_path.stat().st_size}
extra_kwrds = {}
raw_brz_copy_username = Flux(
sources=raw_username,
targets=bronze_username,
transformation=Transformation(function=copy, extra_kwrds=extra_kwrds),
)
return raw_brz_copy_username
def test_add_flux(repository: FSRepository, copy_flux: Flux):
dataplatform = DataPlateform()
dataplatform.add_repository("test", repository)
dataplatform.add_flux(name="copy_flux", flux=copy_flux)
assert dataplatform.fluxes == ["copy_flux"]
dataplatform.add_flux(name="copy_flux_bis", flux=copy_flux)
assert dataplatform.fluxes == ["copy_flux", "copy_flux_bis"]
assert dataplatform.flux("copy_flux") == copy_flux
assert dataplatform.flux("copy_flux_bis") == copy_flux
@pytest.fixture
def dataplatform(
repository: FSRepository,
foo_flux: Flux,
copy_flux: Flux,
) -> DataPlateform:
dp = DataPlateform()
dp.add_repository("test", repository)
dp.add_flux("foo", foo_flux)
dp.add_flux("raw_brz_copy_username", copy_flux)
return dp
@@ -67,8 +127,16 @@ def test_listing_content(dataplatform: DataPlateform):
"username",
"salary",
]
assert dataplatform.repository("test").table("raw", "username").partitions == ["username.csv"]
def test_add_flux(dataplatform: DataPlateform):
# dataplatform.add_flux()
pass
def test_execute_flux(dataplatform: DataPlateform):
meta = dataplatform.execute_flux("foo")
assert meta.data == {"who": "foo"}
assert dataplatform.repository("test").schema("bronze").tables == []
meta = dataplatform.execute_flux("raw_brz_copy_username")
assert meta.data == {"src_size": 175, "tgt_size": 175}
assert dataplatform.repository("test").schema("bronze").tables == ["username"]