Feat: test on pandas xlsx and ods file reader

Feat: start testing fs_repository
Feat: test consume_flux
2024-10-07 05:27:46 +02:00 · 2024-08-14 10:44:38 +02:00 · 2024-08-14 07:41:36 +02:00 · 2024-08-14 07:22:01 +02:00 · 2024-08-14 07:21:36 +02:00 · 2024-08-07 11:39:33 +02:00
42 changed files with 150 additions and 1161 deletions
--- a/66
+++ b/66
@@ -0,0 +1,66 @@
+DATA_BASE=./datas
+
+PDF_BASE=$(DATA_BASE)/pdfs
+PDF_YEARS=$(wildcard $(PDF_BASE)/*)
+
+RAW_BASE=$(DATA_BASE)/raw
+RAW_CRG=$(RAW_BASE)/CRG
+RAW_CRG_YEARS=$(subst $(PDF_BASE), $(RAW_CRG), $(PDF_YEARS))
+
+
+$(RAW_CRG)/%/: $(wildcard $(PDF_BASE)/%/*)
+	echo $(wildcard $(PDF_BASE)/$*/*)
+	@echo ----
+	ls $(PDF_BASE)/$*/
+	@echo ----
+	echo $*
+	@echo ----
+	echo $^
+	@echo ----
+	echo $?
+
+#./datas/raw/CRG/%: 
+#pdf-oralia extract all --src $$year --dest $$(subst $$PDF_BASE, $$RAW_CRG, $$year)
+# $(RAW_CRG_YEARS): $(PDF_PATHS)
+# 	for year in $(PDF_PATHS); do \
+# 		echo $$year; \
+# 		echo $$(subst $$PDF_BASE, $$RAW_CRG, $$year); \
+# 		echo "----"; \
+# 	done;
+
+extract_pdfs:
+	for year in 2021 2022 2023 2024; do \
+		mkdir -p $(RAW_CRG)/$$year/extracted;\
+		pdf-oralia extract all --src $(PDF_BASE)/$$year/ --dest $(RAW_CRG)/$$year/extracted; \
+		pdf-oralia join --src $(RAW_CRG)/$$year/extracted/ --dest $(RAW_CRG)/$$year/; \
+	done
+
+clean_raw:
+	rm -rf ./PLESNA\ Compta\ SYSTEM/raw/**/*.csv
+
+clean_built:
+	rm -rf $(DATA_BASE)/staging/**/*.csv
+	rm -rf $(DATA_BASE)/gold/**/*.csv
+	rm -rf $(DATA_BASE)/datamart/**/*.csv
+	rm -rf $(DATA_BASE)/datamart/**/*.xlsx
+
+run_ingest:
+	python -m scripts ingest
+
+run_feature:
+	python -m scripts feature
+
+run_datamart:
+	python -m scripts datamart
+
+build: clean_built run_ingest run_feature run_datamart
+
+clean_all: clean_built clean_raw
+
+import_nextcloud:
+	rsync -av ~/Nextcloud/PLESNA\ Compta\ SYSTEM/Histoire/ ./datas/Histoire
+
+push_nextcloud:
+	rsync -av ./datas/datamart/ ~/Nextcloud/PLESNA\ Compta\ SYSTEM/DataMart
+
+
--- a/README.md
+++ b/README.md
@@ -1,15 +1,5 @@
 # E(T)LT pour Plesna

-## Installation
-
-## Concepts
-
- `dataplatform`: agrégation d'un datacatalogue, de moteur de compute et du dag des transformations.
- `datacatalogue`: gestion du contenu des datastores.
- `datastore`: interface de stockage des données.
- `compute`: moteur de traitement des fluxs.
- `graph/dag`: organisation logique des fluxs et des données.
-
 ## Stages

 - Raw: fichiers les plus brutes possibles
--- a/plesna/compute/init.py
+++ b/plesna/compute/init.py
--- a/plesna/compute/consume_flux.py
+++ b/plesna/compute/consume_flux.py
@@ -1,8 +0,0 @@
-from plesna.models.flux import Flux, FluxMetaData
-
-
-def consume_flux(flux: Flux) -> FluxMetaData:
-    metadata = flux.transformation.function(
-        sources=flux.sources_dict, targets=flux.targets_dict, **flux.transformation.extra_kwrds
-    )
-    return FluxMetaData(data=metadata)
--- a/plesna/dataplatform.py
+++ b/plesna/dataplatform.py
@@ -1,68 +0,0 @@
-from plesna.compute.consume_flux import consume_flux
-from plesna.graph.graph import Graph, Node
-from plesna.graph.graph_set import EdgeOnSet, GraphSet
-from plesna.models.flux import Flux, FluxMetaData
-from plesna.models.libs.flux_graph import flux_to_edgeonset
-from plesna.storage.repository.repository import Repository
-
-
-class DataPlateformError(Exception):
-    pass
-
-
-class DataPlateform:
-    def __init__(self):
-        self._metadata_engine = ""
-        self._fluxes = {}
-        self._repositories = {}
-
-    def add_repository(self, repository: Repository) -> str:
-        if repository.id in self._repositories:
-            raise DataPlateformError("The repository {repository.id} already exists")
-
-        self._repositories[repository.id] = repository
-        return repository.id
-
-    @property
-    def repositories(self) -> list[str]:
-        return list(self._repositories)
-
-    def repository(self, id: str) -> Repository:
-        return self._repositories[id]
-
-    def add_flux(self, name: str, flux: Flux) -> str:
-        if name in self._fluxes:
-            raise DataPlateformError("The flux {name} already exists")
-
-        self._fluxes[name] = flux
-        return name
-
-    @property
-    def fluxes(self) -> list[str]:
-        return list(self._fluxes)
-
-    def flux(self, name: str) -> Flux:
-        return self._fluxes[name]
-
-    def execute_flux(self, name: str) -> FluxMetaData:
-        if name not in self._fluxes:
-            raise DataPlateformError("The flux {name} is not registered")
-        return consume_flux(self._fluxes[name])
-
-    @property
-    def graphset(self) -> GraphSet:
-        graphset = GraphSet()
-        for flux in self._fluxes.values():
-            edge = flux_to_edgeonset(flux)
-            graphset.append(edge)
-
-        return graphset
-
-    @property
-    def graph(self) -> Graph:
-        graph = Graph()
-        for repo in self._repositories.values():
-            for schema in repo.schemas():
-                for table in repo.tables(schema):
-                    graph.add_node(Node(name=table))
-        return graph
--- a/plesna/graph/init.py
+++ b/plesna/graph/init.py
--- a/plesna/graph/graph.py
+++ b/plesna/graph/graph.py
@@ -1,82 +0,0 @@
-from pydantic import BaseModel
-from functools import reduce
-from plesna.models.graphs import Node, Edge
-
-
-class Graph:
-    def __init__(self, nodes: list[Node] = [], edges: list[Edge] = []):
-        self._edges = []
-        self._nodes = set()
-        self.add_edges(edges)
-        self.add_nodes(nodes)
-
-    def add_node(self, node: Node):
-        self._nodes.add(node)
-
-    def add_nodes(self, nodes: list[Node]):
-        for node in nodes:
-            self.add_node(node)
-
-    def add_edge(self, edge: Edge):
-        self._edges.append(edge)
-        self.add_node(edge.source)
-        self.add_node(edge.target)
-
-    def add_edges(self, edges: list[Edge]):
-        for edge in edges:
-            self.add_edge(edge)
-
-    @property
-    def nodes(self):
-        return self._nodes
-
-    @property
-    def edges(self):
-        return self._edges
-
-    def get_edges_from(self, node: Node) -> list[Edge]:
-        """Get all edges which have the node as source"""
-        return [edge for edge in self._edges if edge.source == node]
-
-    def get_edges_to(self, node: Node) -> list[Edge]:
-        """Get all edges which have the node as target"""
-        return [edge for edge in self._edges if edge.target == node]
-
-    def get_direct_targets_from(self, node: Node) -> set[Node]:
-        """Get direct nodes that are accessible from the node"""
-        return set(edge.target for edge in self._edges if edge.source == node)
-
-    def get_targets_from(self, node: Node) -> set[Node]:
-        """Get all nodes that are accessible from the node
-
-        If the graph have a loop, the procedure be in an infinite loop!
-
-        """
-        direct_targets = self.get_direct_targets_from(node)
-        undirect_targets = [self.get_targets_from(n) for n in direct_targets]
-        undirect_targets = reduce(lambda x, y: x.union(y), undirect_targets, set())
-
-        return direct_targets.union(undirect_targets)
-
-    def get_direct_sources_from(self, node: Node) -> set[Node]:
-        """Get direct nodes that are targeted the node"""
-        return set(edge.source for edge in self._edges if edge.target == node)
-
-    def get_sources_from(self, node: Node) -> set[Node]:
-        """Get all nodes that are targeted the node"""
-        direct_sources = self.get_direct_sources_from(node)
-        undirect_sources = [self.get_sources_from(n) for n in direct_sources]
-        undirect_sources = reduce(lambda x, y: x.union(y), undirect_sources, set())
-
-        return direct_sources.union(undirect_sources)
-
-    def is_dag(self) -> bool:
-        visited = set()
-        for node in self._nodes:
-            if node not in visited:
-                try:
-                    targets = self.get_targets_from(node)
-                except RecursionError:
-                    return False
-                visited.union(targets)
-        return True
--- a/plesna/graph/graph_set.py
+++ b/plesna/graph/graph_set.py
@@ -1,35 +0,0 @@
-from typing import Set
-from plesna.graph.graph import Graph
-from plesna.models.graphs import Edge, EdgeOnSet
-from itertools import product
-
-
-class GraphSet:
-    def __init__(self):
-        self._edges = []
-        self._node_sets = set()
-
-    def append(self, edge: EdgeOnSet):
-        self._edges.append(edge)
-        self._node_sets.add(frozenset(edge.sources))
-        self._node_sets.add(frozenset(edge.targets))
-
-    @property
-    def node_sets(self) -> Set[frozenset]:
-        return self._node_sets
-
-    def to_graph(self) -> Graph:
-        graph = Graph()
-        for node_set in self.node_sets:
-            graph.add_nodes(node_set)
-        for edge in self._edges:
-            flatten_edge = [
-                Edge(arrow=edge.arrow, source=s, target=t, edge_kwrds=edge.edge_kwrds)
-                for (s, t) in product(edge.sources, edge.targets)
-            ]
-            graph.add_edges(flatten_edge)
-
-        return graph
-
-    def is_valid_dag(self) -> bool:
-        return self.to_graph().is_dag()
--- a/plesna/libs/init.py
+++ b/plesna/libs/init.py
--- a/plesna/libs/string_tools.py
+++ b/plesna/libs/string_tools.py
@@ -1,18 +0,0 @@
-import re
-
-
-class StringToolsError(ValueError):
-    pass
-
-
-def extract_values_from_pattern(pattern, string):
-    regex = re.sub(r"{(.+?)}", r"(?P<_\1>.+)", pattern)
-
-    search = re.search(regex, string)
-    if search:
-        values = list(search.groups())
-        keys = re.findall(r"{(.+?)}", pattern)
-        _dict = dict(zip(keys, values))
-        return _dict
-
-    raise StringToolsError(f"Can't parse '{string}' with the pattern '{pattern}'")
--- a/plesna/models/init.py
+++ b/plesna/models/init.py
--- a/plesna/models/flux.py
+++ b/plesna/models/flux.py
@@ -1,48 +0,0 @@
-from collections.abc import Callable
-from pydantic import BaseModel, computed_field
-
-from plesna.models.storage import Table
-
-
-class Transformation(BaseModel):
-    """
-    The function have to have at least 2 arguments: sources and targets
-    Other arguments will came throught extra_kwrds
-
-    The function will have to return metadata as dict
-    """
-
-    function: Callable
-    extra_kwrds: dict = {}
-
-
-class Flux(BaseModel):
-    id: str
-    name: str
-    sources: list[Table]
-    targets: list[Table]
-    transformation: Transformation
-
-    @computed_field
-    @property
-    def sources_dict(self) -> dict[str, Table]:
-        return {s.id: s for s in self.sources}
-
-    @computed_field
-    @property
-    def sources_id(self) -> dict[str, Table]:
-        return [s.id for s in self.sources]
-
-    @computed_field
-    @property
-    def targets_id(self) -> dict[str, Table]:
-        return [s.id for s in self.targets]
-
-    @computed_field
-    @property
-    def targets_dict(self) -> dict[str, Table]:
-        return {s.id: s for s in self.targets}
-
-
-class FluxMetaData(BaseModel):
-    data: dict
--- a/plesna/models/graphs.py
+++ b/plesna/models/graphs.py
@@ -1,22 +0,0 @@
-from pydantic import BaseModel
-
-
-class Node(BaseModel):
-    name: str
-
-    def __hash__(self):
-        return hash(self.name)
-
-
-class Edge(BaseModel):
-    arrow: str
-    source: Node
-    target: Node
-    edge_kwrds: dict = {}
-
-
-class EdgeOnSet(BaseModel):
-    arrow: str
-    sources: list[Node]
-    targets: list[Node]
-    edge_kwrds: dict = {}
--- a/plesna/models/storage.py
+++ b/plesna/models/storage.py
@@ -1,60 +0,0 @@
-from pydantic import BaseModel
-
-
-class Schema(BaseModel):
-    """Where multiple tables are stored
-
-    id: uniq identifier for the schema
-    repo_id: id of the repo where the schema belong to
-    name: name of the schema
-    value: string which describe where to find the schema in the repository
-    """
-
-    id: str
-    repo_id: str
-    name: str
-    value: str
-    tables: list[str] = []
-
-
-class Table(BaseModel):
-    """Place where same structured data are stored
-
-    id: uniq identifier for the table
-    repo_id: id of the repo where the table belong to
-    schema_id: id of the schema where table belong to
-    name: the name of the table
-    value: string which describe where to find the table in the storage system
-
-    partitions: list of partitions
-    datas: list of string to access data
-
-    """
-
-    id: str
-    repo_id: str
-    schema_id: str
-    name: str
-    value: str
-    partitions: list[str] = []
-    datas: list[str]
-
-
-class Partition(BaseModel):
-    """Place where data are stored
-
-    id: uniq identifier for the table
-    repo_id: id of the repo where the table belong to
-    schema_id: id of the schema where table belong to
-    table_id: id of the schema where table belong to
-    name: the name of the partition
-    value: string which describe where to find the partition in the storage system
-
-    """
-
-    id: str
-    repo_id: str
-    schema_id: str
-    table_id: str
-    name: str
-    value: str
--- a/plesna/storage/init.py
+++ b/plesna/storage/init.py
--- a/plesna/storage/datacatalogue.py
+++ b/plesna/storage/datacatalogue.py
@@ -1,24 +0,0 @@
-import abc
-
-from plesna.models.storage import Schema
-
-
-class DataCatalogue:
-    def __init__(self):
-        pass
-
-    @property
-    @abc.abstractmethod
-    def schemas(self) -> list[str]:
-        """List schema's names"""
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def schema(self, name: str) -> Schema:
-        """Get the schema properties"""
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def tables(self, schema: str) -> list[str]:
-        """List table's name in schema"""
-        raise NotImplementedError
--- a/plesna/storage/repository/init.py
+++ b/plesna/storage/repository/init.py
--- a/plesna/storage/repository/fs_repository.py
+++ b/plesna/storage/repository/fs_repository.py
@@ -1,197 +0,0 @@
-from pathlib import Path
-
-from pydantic import BaseModel, computed_field
-
-from plesna.libs.string_tools import extract_values_from_pattern
-from plesna.models.storage import Partition, Schema, Table
-from plesna.storage.repository.repository import Repository
-
-
-class FSTable(BaseModel):
-    name: str
-    repo_id: str
-    schema_id: str
-    id: str
-    path: Path
-    is_partitionned: bool
-    partitions: list[str] = []
-
-    @computed_field
-    @property
-    def ref(self) -> Table:
-        if self.is_partitionned:
-            datas = [str(self.path.absolute() / p) for p in self.partitions]
-        else:
-            datas = [str(self.path.absolute())]
-
-        return Table(
-            id=self.id,
-            repo_id=self.repo_id,
-            schema_id=self.schema_id,
-            name=self.name,
-            value=str(self.path.absolute()),
-            partitions=self.partitions,
-            datas=datas,
-        )
-
-
-class FSSchema(BaseModel):
-    name: str
-    repo_id: str
-    id: str
-    path: Path
-    tables: list[str]
-
-    @computed_field
-    @property
-    def ref(self) -> Schema:
-        return Schema(
-            id=self.id,
-            repo_id=self.repo_id,
-            name=self.name,
-            value=str(self.path.absolute()),
-            tables=self.tables,
-        )
-
-
-class FSRepositoryError(ValueError):
-    pass
-
-
-class FSRepository(Repository):
-    """Repository based on files tree structure
-
-    - first level: schemas
-    - second level: tables
-    - third level: partition (actual datas)
-
-    """
-
-    ID_FMT = {
-        "schema": "{repo_id}-{schema_name}",
-        "table": "{schema_id}-{table_name}",
-    }
-
-    def __init__(self, id: str, name: str, basepath: str):
-        super().__init__(id, name)
-
-        self._basepath = Path(basepath)
-        assert self._basepath.exists()
-
-    def ls(self, dir="", only_files=False, only_directories=False, recursive=False) -> list[str]:
-        """List files in dir
-
-        :param dir: relative path from self._basepath
-        :param only_files: if true return only files
-        :param only_directories: if true return only directories
-        :param recursive: list content recursively (only for)
-        :return: list of string describing path from self._basepath / dir
-        """
-        dirpath = self._basepath / dir
-
-        if recursive:
-            paths = dirpath.rglob("*")
-        else:
-            paths = dirpath.iterdir()
-
-        if only_files:
-            return [
-                str(f.relative_to(dirpath))
-                for f in paths
-                if not f.is_dir() and not str(f).startswith(".")
-            ]
-        if only_directories:
-            return [
-                str(f.relative_to(dirpath))
-                for f in paths
-                if f.is_dir() and not str(f).startswith(".")
-            ]
-
-        return [str(f.relative_to(dirpath)) for f in paths if not str(f).startswith(".")]
-
-    def parse_id(self, string: str, id_type: str) -> dict:
-        if id_type not in self.ID_FMT:
-            raise FSRepositoryError(
-                "Wrong id_type. Gots {id_type} needs to be one of {self.ID_FMT.values}"
-            )
-        parsed = extract_values_from_pattern(self.ID_FMT[id_type], string)
-        if not parsed:
-            raise FSRepositoryError(
-                f"Wrong format for {id_type}. Got {string} need {self.ID_FMT['id_type']}"
-            )
-        return parsed
-
-    def schemas(self) -> list[str]:
-        """List schemas (sub directories within basepath)"""
-        subdirectories = self.ls("", only_directories=True)
-        return [
-            self.ID_FMT["schema"].format(repo_id=self.id, schema_name=d) for d in subdirectories
-        ]
-
-    def _schema(self, schema_id: str) -> FSSchema:
-        """List schemas (sub directories within basepath)"""
-        parsed = self.parse_id(schema_id, "schema")
-
-        repo_id = parsed["repo_id"]
-        schema_name = parsed["schema_name"]
-        schema_path = self._basepath / schema_name
-
-        if repo_id != self.id:
-            raise FSRepositoryError("Trying to get schema that don't belong in this repository")
-
-        tables = self.tables(schema_id)
-        return FSSchema(
-            name=schema_name,
-            id=schema_id,
-            repo_id=self.id,
-            schema_id=schema_id,
-            path=schema_path,
-            tables=tables,
-        )
-
-    def schema(self, schema_id: str) -> Schema:
-        return self._schema(schema_id).ref
-
-    def _tables(self, schema_id: str) -> list[str]:
-        parsed = self.parse_id(schema_id, "schema")
-        tables = self.ls(parsed["schema_name"])
-        return [self.ID_FMT["table"].format(table_name=t, schema_id=schema_id) for t in tables]
-
-    def tables(self, schema_id: str = "") -> list[str]:
-        if schema_id:
-            return self._tables(schema_id)
-
-        tables = []
-        for schema in self.schemas():
-            tables += self._tables(schema)
-        return tables
-
-    def _table(self, table_id: str) -> FSTable:
-        """Get infos on the table"""
-        parsed = self.parse_id(table_id, "table")
-        schema = self._schema(parsed["schema_id"])
-
-        if not schema.path.exists():
-            raise FSRepositoryError(f"The schema {schema.id} does not exists.")
-
-        table_subpath = f"{schema.name}/{parsed['table_name']}"
-        table_path = self._basepath / table_subpath
-
-        is_partitionned = table_path.is_dir()
-        if is_partitionned:
-            partitions = self.ls(table_subpath, only_files=True)
-        else:
-            partitions = []
-
-        return FSTable(
-            name=parsed["table_name"],
-            id=table_id,
-            repo_id=self.id,
-            schema_id=schema.id,
-            path=table_path,
-            is_partitionned=is_partitionned,
-            partitions=partitions,
-        )
-
-    def table(self, table_id: str) -> Table:
-        return self._table(table_id).ref
--- a/plesna/storage/repository/repository.py
+++ b/plesna/storage/repository/repository.py
@@ -1,37 +0,0 @@
-import abc
-
-from plesna.models.storage import Partition, Schema, Table
-
-
-class Repository:
-    def __init__(self, id: str, name: str):
-        self._id = id
-        self._name = name
-
-    @property
-    def id(self) -> str:
-        return self._id
-
-    @property
-    def name(self) -> str:
-        return self._name
-
-    @abc.abstractmethod
-    def schemas(self) -> list[str]:
-        """List schema's ids"""
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def schema(self, schema_id: str) -> Schema:
-        """Get the schema properties"""
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def tables(self, schema_id: str) -> list[str]:
-        """List table's name in schema (the id)"""
-        raise NotImplementedError
-
-    @abc.abstractmethod
-    def table(self, table_id: str) -> Table:
-        """Get the table properties (the id)"""
-        raise NotImplementedError
--- a/tests/compute/init.py
+++ b/tests/compute/init.py
--- a/tests/compute/test_consume_flux.py
+++ b/tests/compute/test_consume_flux.py
@@ -1,36 +0,0 @@
-from plesna.compute.consume_flux import consume_flux
-from plesna.models.flux import Flux, Transformation
-from plesna.models.storage import Table
-
-
-def test_consume_flux():
-    sources = [
-        Table(id="src1", repo_id="test", schema_id="test", name="test", value="here", datas=["d"]),
-        Table(id="src2", repo_id="test", schema_id="test", name="test", value="here", datas=["d"]),
-    ]
-    targets = [
-        Table(id="tgt1", repo_id="test", schema_id="test", name="test", value="this", datas=["d"]),
-        Table(id="tgt2", repo_id="test", schema_id="test", name="test", value="that", datas=["d"]),
-    ]
-
-    def func(sources, targets, **kwrds):
-        return {
-            "sources": len(sources),
-            "targets": len(targets),
-            "kwrds": len(kwrds),
-        }
-
-    flux = Flux(
-        id="flux",
-        name="flux",
-        sources=sources,
-        targets=targets,
-        transformation=Transformation(function=func, extra_kwrds={"extra": "super"}),
-    )
-
-    meta = consume_flux(flux)
-    assert meta.data == {
-        "sources": 2,
-        "targets": 2,
-        "kwrds": 1,
-    }
--- a/tests/dataplatform/test_dataplateform.py
+++ b/tests/dataplatform/test_dataplateform.py
@@ -1,171 +0,0 @@
-import shutil
-from pathlib import Path
-
-import pytest
-
-from plesna.dataplatform import DataPlateform
-from plesna.models.graphs import Node
-from plesna.models.flux import Flux, Transformation
-from plesna.storage.repository.fs_repository import FSRepository
-
-FIXTURE_DIR = Path(__file__).parent.parent / Path("raw_datas")
-
-
-@pytest.fixture
-def repository(tmp_path) -> FSRepository:
-    example_src = FIXTURE_DIR
-    assert example_src.exists()
-
-    raw_path = Path(tmp_path) / "raw"
-    shutil.copytree(src=example_src.absolute(), dst=raw_path.absolute())
-
-    bronze_path = Path(tmp_path) / "bronze"
-    bronze_path.mkdir()
-    silver_path = Path(tmp_path) / "silver"
-    silver_path.mkdir()
-
-    return FSRepository("test", "test", tmp_path)
-
-
-def test_add_repository(
-    repository: FSRepository,
-):
-    dp = DataPlateform()
-    dp.add_repository(repository)
-
-    assert dp.repositories == ["test"]
-
-    assert dp.repository("test") == repository
-
-
-@pytest.fixture
-def copy_flux(repository: FSRepository) -> Flux:
-    raw_username = [repository.table("test-raw-username")]
-    bronze_username = [repository.table("test-bronze-username")]
-
-    def copy(sources, targets):
-        src_path = Path(sources["test-raw-username"].datas[0])
-        tgt_path = Path(targets["test-bronze-username"].datas[0])
-        shutil.copy(src_path, tgt_path)
-        return {"src_size": src_path.stat().st_size, "tgt_size": tgt_path.stat().st_size}
-
-    extra_kwrds = {}
-
-    raw_brz_copy_username = Flux(
-        id="copy_flux",
-        name="copy",
-        sources=raw_username,
-        targets=bronze_username,
-        transformation=Transformation(function=copy, extra_kwrds=extra_kwrds),
-    )
-    return raw_brz_copy_username
-
-
-@pytest.fixture
-def foo_flux(repository: FSRepository) -> Flux:
-    src = [
-        repository.table("test-raw-username"),
-        repository.table("test-raw-recovery"),
-    ]
-    targets = [repository.table("test-bronze-foo")]
-
-    def foo(sources, targets):
-        return {"who": "foo"}
-
-    extra_kwrds = {}
-
-    flux = Flux(
-        id="foo_flux",
-        name="foo",
-        sources=src,
-        targets=targets,
-        transformation=Transformation(function=foo, extra_kwrds=extra_kwrds),
-    )
-    return flux
-
-
-def test_add_flux(repository: FSRepository, copy_flux: Flux):
-    dataplatform = DataPlateform()
-    dataplatform.add_repository(repository)
-
-    dataplatform.add_flux(name="copy_flux", flux=copy_flux)
-    assert dataplatform.fluxes == ["copy_flux"]
-    dataplatform.add_flux(name="copy_flux_bis", flux=copy_flux)
-    assert dataplatform.fluxes == ["copy_flux", "copy_flux_bis"]
-
-    assert dataplatform.flux("copy_flux") == copy_flux
-    assert dataplatform.flux("copy_flux_bis") == copy_flux
-
-
-@pytest.fixture
-def dataplatform(
-    repository: FSRepository,
-    foo_flux: Flux,
-    copy_flux: Flux,
-) -> DataPlateform:
-    dp = DataPlateform()
-
-    dp.add_repository(repository)
-
-    dp.add_flux("foo", foo_flux)
-    dp.add_flux("raw_brz_copy_username", copy_flux)
-    return dp
-
-
-def test_listing_content(dataplatform: DataPlateform):
-    assert dataplatform.repository("test").schemas() == ["test-raw", "test-bronze", "test-silver"]
-    assert dataplatform.repository("test").schema("test-raw").tables == [
-        "test-raw-username",
-        "test-raw-recovery",
-        "test-raw-salary",
-    ]
-    assert dataplatform.repository("test").table("test-raw-username").partitions == ["username.csv"]
-    assert dataplatform.repository("test").table("test-raw-recovery").partitions == [
-        "2022.csv",
-        "2023.csv",
-        "2024.csv",
-    ]
-
-
-def test_content_from_graph(dataplatform: DataPlateform):
-    assert dataplatform.graph.nodes == {
-        Node(name="test-raw-recovery", infos={}),
-        Node(name="test-raw-salary", infos={}),
-        Node(name="test-raw-username", infos={}),
-    }
-
-    assert dataplatform.graphset.node_sets == {
-        frozenset(
-            {
-                Node(name="test-bronze-username"),
-            }
-        ),
-        frozenset(
-            {
-                Node(name="test-bronze-foo"),
-            }
-        ),
-        frozenset(
-            {
-                Node(name="test-raw-username"),
-            }
-        ),
-        frozenset(
-            {
-                Node(name="test-raw-username"),
-                Node(name="test-raw-recovery"),
-            }
-        ),
-    }
-
-
-def test_execute_flux(dataplatform: DataPlateform):
-    meta = dataplatform.execute_flux("foo")
-    assert meta.data == {"who": "foo"}
-
-    assert dataplatform.repository("test").schema("test-bronze").tables == []
-
-    meta = dataplatform.execute_flux("raw_brz_copy_username")
-    assert meta.data == {"src_size": 283, "tgt_size": 283}
-
-    assert dataplatform.repository("test").schema("test-bronze").tables == ["test-bronze-username"]
--- a/tests/e2e/test_datalake.py
+++ b/tests/e2e/test_datalake.py
@@ -1,39 +0,0 @@
-from pathlib import Path
-
-import pytest
-
-from plesna.dataplatform import DataPlateform
-from plesna.datastore.fs_datacatalogue import FSDataCatalogue
-
-FIXTURE_DIR = Path(__file__).parent / Path("raw_data")
-
-
-@pytest.fixture
-def raw_catalogue(tmp_path):
-    raw_path = Path(tmp_path) / "raw"
-    return FSDataCatalogue(raw_path)
-
-
-@pytest.fixture
-def bronze_catalogue(tmp_path):
-    bronze_path = Path(tmp_path) / "bronze"
-    return FSDataCatalogue(bronze_path)
-
-
-@pytest.fixture
-def silver_catalogue(tmp_path):
-    silver_path = Path(tmp_path) / "silver"
-    return FSDataCatalogue(silver_path)
-
-
-@pytest.fixture
-def dataplateform(
-    raw_catalogue: FSDataCatalogue,
-    bronze_catalogue: FSDataCatalogue,
-    silver_catalogue: FSDataCatalogue,
-):
-    dp = DataPlateform()
-    dp.add_datacatalague("raw", raw_catalogue)
-    dp.add_datacatalague("bronze", bronze_catalogue)
-    dp.add_datacatalague("silver", silver_catalogue)
-    pass
--- a/tests/graphs/init.py
+++ b/tests/graphs/init.py
--- a/tests/graphs/test_graph.py
+++ b/tests/graphs/test_graph.py
@@ -1,106 +0,0 @@
-import pytest
-
-from plesna.graph.graph import Graph
-from plesna.models.graphs import Edge, Node
-
-
-def test_append_nodess():
-    nodeA = Node(name="A")
-    nodeB = Node(name="B")
-
-    graph = Graph()
-    graph.add_node(nodeA)
-    graph.add_node(nodeB)
-
-    assert graph.nodes == {nodeA, nodeB}
-
-
-def test_append_edges():
-    nodeA = Node(name="A")
-    nodeB = Node(name="B")
-    nodeC = Node(name="C")
-
-    edge1 = Edge(arrow="arrow", source=nodeA, target=nodeC)
-    edge2 = Edge(arrow="arrow", source=nodeB, target=nodeC)
-
-    graph = Graph()
-    graph.add_edge(edge1)
-    graph.add_edge(edge2)
-
-    assert graph.nodes == {nodeA, nodeB, nodeC}
-
-
-def test_init_edges_nodes():
-    nodeA = Node(name="A")
-    nodeB = Node(name="B")
-    nodeC = Node(name="C")
-
-    edge1 = Edge(arrow="arrow", source=nodeB, target=nodeC)
-
-    graph = Graph()
-    graph.add_node(nodeA)
-    graph.add_edge(edge1)
-
-    assert graph.nodes == {nodeA, nodeB, nodeC}
-
-
-@pytest.fixture
-def nodes():
-    return {
-        "A": Node(name="A"),
-        "B": Node(name="B"),
-        "C": Node(name="C"),
-        "D": Node(name="D"),
-    }
-
-
-@pytest.fixture
-def dag_edges(nodes):
-    return {
-        "1": Edge(arrow="arrow", source=nodes["A"], target=nodes["C"]),
-        "2": Edge(arrow="arrow", source=nodes["B"], target=nodes["C"]),
-        "3": Edge(arrow="arrow", source=nodes["C"], target=nodes["D"]),
-    }
-
-
-@pytest.fixture
-def notdag_edges(nodes):
-    return {
-        "1": Edge(arrow="arrow", source=nodes["A"], target=nodes["C"]),
-        "2": Edge(arrow="arrow", source=nodes["B"], target=nodes["C"]),
-        "3": Edge(arrow="arrow", source=nodes["C"], target=nodes["D"]),
-        "4": Edge(arrow="arrow", source=nodes["D"], target=nodes["B"]),
-    }
-
-
-def test_get_edges_from(nodes, dag_edges):
-    edges = dag_edges
-    graph = Graph(edges=edges.values())
-    assert graph.get_edges_from(nodes["A"]) == [edges["1"]]
-
-
-def test_get_targets_from(nodes, dag_edges):
-    edges = dag_edges
-    graph = Graph(edges=edges.values())
-    assert graph.get_direct_targets_from(nodes["A"]) == set([nodes["C"]])
-    assert graph.get_direct_targets_from(nodes["C"]) == set([nodes["D"]])
-    assert graph.get_direct_targets_from(nodes["D"]) == set()
-    assert graph.get_targets_from(nodes["A"]) == set([nodes["C"], nodes["D"]])
-
-
-def test_get_sources_from(nodes, dag_edges):
-    edges = dag_edges
-    graph = Graph(edges=edges.values())
-    assert graph.get_direct_sources_from(nodes["A"]) == set()
-    assert graph.get_direct_sources_from(nodes["C"]) == set([nodes["A"], nodes["B"]])
-    assert graph.get_direct_sources_from(nodes["D"]) == set([nodes["C"]])
-
-    assert graph.get_sources_from(nodes["D"]) == set([nodes["A"], nodes["B"], nodes["C"]])
-
-
-def test_valid_dage(dag_edges, notdag_edges):
-    graph = Graph(edges=dag_edges.values())
-    assert graph.is_dag()
-
-    graph = Graph(edges=notdag_edges.values())
-    assert not graph.is_dag()
--- a/tests/graphs/test_graph_set.py
+++ b/tests/graphs/test_graph_set.py
@@ -1,43 +0,0 @@
-from plesna.graph.graph import Graph
-from plesna.graph.graph_set import GraphSet
-from plesna.models.graphs import Edge, EdgeOnSet, Node
-
-
-def test_init():
-    graph_set = GraphSet()
-
-    nodeA = Node(name="A")
-    nodeB = Node(name="B")
-    nodeC = Node(name="C")
-    edge1 = EdgeOnSet(arrow="arrow", sources=[nodeA, nodeB], targets=[nodeC])
-
-    graph_set.append(edge1)
-
-    assert graph_set.node_sets == {frozenset([nodeA, nodeB]), frozenset([nodeC])}
-
-
-def test_to_graph():
-    graph_set = GraphSet()
-
-    nodeA = Node(name="A")
-    nodeB = Node(name="B")
-    nodeC = Node(name="C")
-    nodeD = Node(name="D")
-    edge1 = EdgeOnSet(arrow="arrow-AB-C", sources=[nodeA, nodeB], targets=[nodeC])
-    edge2 = EdgeOnSet(arrow="arrow-C-D", sources=[nodeC], targets=[nodeD])
-
-    graph_set.append(edge1)
-    graph_set.append(edge2)
-
-    graph = graph_set.to_graph()
-    assert graph.nodes == {
-        nodeA,
-        nodeB,
-        nodeC,
-        nodeD,
-    }
-    assert graph.edges == [
-        Edge(arrow="arrow-AB-C", source=nodeA, target=nodeC),
-        Edge(arrow="arrow-AB-C", source=nodeB, target=nodeC),
-        Edge(arrow="arrow-C-D", source=nodeC, target=nodeD),
-    ]
--- a/tests/libs/init.py
+++ b/tests/libs/init.py
--- a/tests/libs/test_string_tools.py
+++ b/tests/libs/test_string_tools.py
@@ -1,18 +0,0 @@
-import pytest
-
-from plesna.libs.string_tools import StringToolsError, extract_values_from_pattern
-
-
-def test_extract_values_from_pattern():
-    source = "id:truc-bidule-machin"
-    pattern = "id:{champ1}-{champ2}-machin"
-
-    assert extract_values_from_pattern(pattern, source) == {"champ1": "truc", "champ2": "bidule"}
-
-
-def test_extract_values_from_pattern_no_match():
-    source = "id:truc-bidule"
-    pattern = "id:{champ1}-{champ2}-machin"
-
-    with pytest.raises(StringToolsError):
-        extract_values_from_pattern(pattern, source)
--- a/tests/raw_datas/recovery/2022.csv
+++ b/tests/raw_datas/recovery/2022.csv
@@ -1,3 +0,0 @@
-Identifier,One-time password
-9012,12se74
-2070,04ap67
--- a/tests/raw_datas/recovery/2023.csv
+++ b/tests/raw_datas/recovery/2023.csv
@@ -1,4 +0,0 @@
-Identifier,One-time password
-9012,32ui83
-9346,14ju73
-5079,09ja61
--- a/tests/raw_datas/recovery/2024.csv
+++ b/tests/raw_datas/recovery/2024.csv
@@ -1,4 +0,0 @@
-Identifier,One-time password
-9012,74iu23
-2070,12io89
-5079,85nc83
--- a/tests/raw_datas/salary/salary.pdf
+++ b/tests/raw_datas/salary/salary.pdf
--- a/tests/raw_datas/username/username.csv
+++ b/tests/raw_datas/username/username.csv
@@ -1,6 +0,0 @@
-Username,Identifier,First name,Last name,Department,Location
-booker12,9012,Rachel,Booker,Sales,Manchester
-grey07,2070,Laura,Grey,Depot,London
-johnson81,4081,Craig,Johnson,Depot,London
-jenkins46,9346,Mary,Jenkins,Engineering,Manchester
-smith79,5079,Jamie,Smith,Engineering,Manchester
--- a/tests/repository/init.py
+++ b/tests/repository/init.py
--- a/tests/repository/fs_examples/salary.pdf
+++ b/tests/repository/fs_examples/salary.pdf
--- a/tests/repository/fs_examples/username-password-recovery-code.xls
+++ b/tests/repository/fs_examples/username-password-recovery-code.xls
--- a/tests/repository/fs_examples/username-password-recovery-code.xlsx
+++ b/tests/repository/fs_examples/username-password-recovery-code.xlsx
--- a/tests/repository/fs_examples/username.csv
+++ b/tests/repository/fs_examples/username.csv
--- a/tests/repository/test_fs_repository.py
+++ b/tests/repository/test_fs_repository.py
@@ -0,0 +1,84 @@
+import shutil
+from pathlib import Path
+
+import pytest
+from pandas import pandas
+
+from dashboard.libs.repository.fs_repository import FSRepository
+
+EXAMPLE_DIR = "./tests/repository/fs_examples/"
+
+
+@pytest.fixture
+def location(tmp_path):
+    """Build a temporary repository root holding the example files.
+
+    Creates ``username`` and ``salary`` directories under ``tmp_path`` and
+    copies every entry of ``EXAMPLE_DIR`` into one of them: entries whose
+    path contains "username" go to ``username``, the others to ``salary``.
+    Returns ``tmp_path``.
+    """
+    username_dir = tmp_path / "username"
+    salary_dir = tmp_path / "salary"
+    username_dir.mkdir()
+    salary_dir.mkdir()
+    for example in Path(EXAMPLE_DIR).glob("*"):
+        shutil.copy(example, username_dir if "username" in str(example) else salary_dir)
+    return tmp_path
+
+
+def test_init(location):
+    """Check directory, schema and table listings of a fresh repository.
+
+    The expected lists are compared with ``==``, so both content and order
+    of what ``FSRepository`` returns must match.
+    """
+    repo = FSRepository("example", location)
+    # Top-level entries, then schemas ("." is listed alongside the two dirs).
+    assert repo.ls() == ["username", "salary"]
+    assert repo.schemas() == [".", "username", "salary"]
+
+    # No tables without a schema argument; per-schema tables are file names.
+    assert repo.tables() == []
+    assert repo.tables("username") == [
+        "username.csv",
+        "username-password-recovery-code.xlsx",
+        "username-password-recovery-code.xls",
+    ]
+    assert repo.tables("salary") == ["salary.pdf"]
+
+
+def test_read_csv(location):
+    """Read ``username.csv`` from the ``username`` schema with ";" delimiter."""
+    repo = FSRepository("example", location)
+    frame = repo.read("username.csv", "username", delimiter=";")
+
+    # Column labels are compared in order.
+    expected_columns = ["Username", "Identifier", "First name", "Last name"]
+    assert list(frame.columns) == expected_columns
+    # The example file is expected to hold five data rows.
+    assert len(frame.index) == 5
+
+
+def test_fake_read_xlsx(location):
+    """Smoke-test ``pandas.read_excel`` on the ``.xls`` example, without the repository."""
+    df = pandas.read_excel(
+        location / "username" / "username-password-recovery-code.xls"
+    )
+    assert isinstance(df, pandas.DataFrame)
+
+
+def test_read_xlsx(location):
+    """Read the ``.xls`` example of the ``username`` schema via the repository.
+
+    NOTE(review): the test name says xlsx but the file read is the ``.xls``
+    one -- confirm which format is meant to be covered.
+    """
+    repo = FSRepository("example", location)
+    frame = repo.read("username-password-recovery-code.xls", "username")
+
+    # Column labels are compared in order.
+    account_columns = ["Username", "Identifier", "One-time password", "Recovery code"]
+    person_columns = ["First name", "Last name", "Department", "Location"]
+    assert list(frame.columns) == account_columns + person_columns
+    assert len(frame.index) == 5
--- a/tests/storage/init.py
+++ b/tests/storage/init.py
--- a/tests/storage/test_fs_repository.py
+++ b/tests/storage/test_fs_repository.py
@@ -1,115 +0,0 @@
-import shutil
-from pathlib import Path
-
-import pytest
-
-from plesna.storage.repository.fs_repository import FSRepository
-
-FIXTURE_DIR = Path(__file__).parent.parent / Path("./raw_datas/")
-
-
-@pytest.fixture
-def location(tmp_path):
-    schema = tmp_path / "schema"
-    example_src = FIXTURE_DIR
-    assert example_src.exists()
-
-    shutil.copytree(src=example_src.absolute(), dst=schema.absolute())
-
-    return tmp_path
-
-
-def test_init(location):
-    repo = FSRepository("example", "example", location)
-    assert repo.ls() == [
-        "schema",
-    ]
-    assert repo.ls(dir="schema") == [
-        "username",
-        "recovery",
-        "salary",
-    ]
-
-    assert repo.ls(recursive=True) == [
-        "schema",
-        "schema/username",
-        "schema/recovery",
-        "schema/salary",
-        "schema/username/username.csv",
-        "schema/recovery/2022.csv",
-        "schema/recovery/2023.csv",
-        "schema/recovery/2024.csv",
-        "schema/salary/salary.pdf",
-    ]
-
-
-@pytest.fixture
-def repository(location) -> FSRepository:
-    return FSRepository("repo_id", "example", location)
-
-
-def test_list_schemas(repository):
-    assert repository.schemas() == ["repo_id-schema"]
-
-
-def test_describe_schema(location, repository):
-    schema = repository.schema("repo_id-schema")
-    assert schema.name == "schema"
-    assert schema.id == "repo_id-schema"
-    assert schema.repo_id == "repo_id"
-    assert schema.value == str(location / "schema")
-    assert schema.tables == [
-        "repo_id-schema-username",
-        "repo_id-schema-recovery",
-        "repo_id-schema-salary",
-    ]
-
-
-def test_list_tables_schema(repository):
-    assert repository.schema("repo_id-schema").tables == [
-        "repo_id-schema-username",
-        "repo_id-schema-recovery",
-        "repo_id-schema-salary",
-    ]
-    assert repository.tables("repo_id-schema") == [
-        "repo_id-schema-username",
-        "repo_id-schema-recovery",
-        "repo_id-schema-salary",
-    ]
-    assert repository.tables() == [
-        "repo_id-schema-username",
-        "repo_id-schema-recovery",
-        "repo_id-schema-salary",
-    ]
-
-
-def test_describe_table(location, repository):
-    table = repository.table("repo_id-schema-username")
-
-    assert table.id == "repo_id-schema-username"
-    assert table.repo_id == "repo_id"
-    assert table.schema_id == "repo_id-schema"
-    assert table.name == "username"
-    assert table.value == str(location / "schema" / "username")
-    assert table.partitions == ["username.csv"]
-    assert table.datas == [table.value + "/username.csv"]
-
-
-def test_describe_table_with_partitions(location, repository):
-    table = repository.table("repo_id-schema-recovery")
-
-    assert table.id == "repo_id-schema-recovery"
-    assert table.repo_id == "repo_id"
-    assert table.schema_id == "repo_id-schema"
-    assert table.name == "recovery"
-    assert table.value == str(location / "schema" / "recovery")
-    assert table.partitions == [
-        "2022.csv",
-        "2023.csv",
-        "2024.csv",
-    ]
-    assert table.datas == [
-        table.value + "/2022.csv",
-        table.value + "/2023.csv",
-        table.value + "/2024.csv",
-    ]
--- a/uv.lock
+++ b/uv.lock
@@ -1,7 +0,0 @@
-version = 1
-requires-python = ">=3.13"
-
-[[package]]
-name = "plesna"
-version = "0.1.0"
-source = { virtual = "." }
Author	SHA1	Message	Date
Bertrand Benjamin	e794242a03	Feat: test on pandas xlsx and ods file reader	2024-10-07 05:27:46 +02:00
Bertrand Benjamin	5450de8628	Feat: start testing fs_repository	2024-08-14 10:44:38 +02:00
Bertrand Benjamin	08c7fbe4c5	Feat: test consume_flux	2024-08-14 07:41:36 +02:00
Bertrand Benjamin	959b53e6a0	Feat: start flux	2024-08-14 07:22:01 +02:00
Bertrand Benjamin	91e229eab2	Feat: add __init__ and mod function signature	2024-08-14 07:21:36 +02:00
Bertrand Benjamin	2de0e5ef5c	refact: rename stage to repository	2024-08-07 11:39:33 +02:00
Bertrand Benjamin	7fb7bc6f5c	Feat: put table's callback and layout in factory	2024-07-28 18:49:07 +02:00
Bertrand Benjamin	612df0a8eb	Feat: callback to toggle editing	2024-07-28 17:34:56 +02:00
Bertrand Benjamin	74882ae572	Feat: add navigation	2024-07-28 12:29:14 +02:00
Bertrand Benjamin	d8f2fb52e1	feat: organise router path	2024-07-27 19:19:59 +02:00
Bertrand Benjamin	f9bfb917bd	feat: global design	2024-07-27 18:45:20 +02:00
Bertrand Benjamin	cdad13788a	feat: add tailwindcss	2024-07-27 18:22:00 +02:00
Bertrand Benjamin	29c82ae597	feat: add recursive schema	2024-07-27 17:50:29 +02:00
Bertrand Benjamin	5b53630688	feat: add schema and table listing	2024-07-27 17:39:09 +02:00
Bertrand Benjamin	ed6d1c87d1	feat: init dashboard	2024-07-27 15:55:20 +02:00