Feat: first analysis and plots

2019-08-04 23:24:32 +02:00
parent 2296615cb4
commit 358af2aabf
5 changed files with 561 additions and 125 deletions
--- a/recopytex/init.py
+++ b/recopytex/init.py
@@ -1,15 +1,5 @@
 #!/usr/bin/env python
 # encoding: utf-8

-NO_STUDENT_COLUMNS = [
-    "Trimestre",
-    "Nom",
-    "Date",
-    "Exercice",
-    "Question",
-    "Competence",
-    "Domaine",
-    "Commentaire",
-    "Bareme",
-    "Niveau",
-]
+from .csv_extraction import flat_clear_csv
+from .df_marks_manip import pp_q_scores
--- a/recopytex/config.py
+++ b/recopytex/config.py
@@ -0,0 +1,30 @@
+#!/usr/bin/env python
+# encoding: utf-8
+
+NO_ST_COLUMNS = {  # Question meta data columns of the csv files: internal key -> column name
+    "term": "Trimestre",
+    "assessment": "Nom",
+    "date": "Date",
+    "exercise": "Exercice",
+    "question": "Question",
+    "competence": "Competence",
+    "theme": "Domaine",
+    "comment": "Commentaire",
+    "score_rate": "Bareme",
+    "is_leveled": "Est_nivele",
+}
+
+COLUMNS = {  # Every column name: the question meta data plus the student related ones
+    **NO_ST_COLUMNS,
+    "student": "Eleve",
+    "score": "Score",
+    "mark": "Note",
+    "level": "Niveau",
+    "normalized": "Normalise",
+}
+
+VALIDSCORE = {  # Non numeric values accepted as a score
+    "NOTFILLED": "",  # The item is not scored yet
+    "NOANSWER": ".",  # Student gave no answer (this score will impact the final mark)
+    "ABS": "a",  # Student was absent (this score won't impact the final mark)
+}
--- a/recopytex/csv_extraction.py
+++ b/recopytex/csv_extraction.py
@@ -4,12 +4,12 @@
 """ Extracting data from xlsx files """

 import pandas as pd
-from . import NO_STUDENT_COLUMNS
+from .config import NO_ST_COLUMNS, COLUMNS, VALIDSCORE

 pd.set_option("Precision", 2)


-def extract_students(df, no_student_columns=NO_STUDENT_COLUMNS):
+def extract_students(df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Extract the list of students from df 

    :param df: the dataframe
@@ -20,7 +20,7 @@ def extract_students(df, no_student_columns=NO_STUDENT_COLUMNS):
    return students


-def flat_df_students(df, no_student_columns=NO_STUDENT_COLUMNS):
+def flat_df_students(df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Flat the ws for students

    :param df: the dataframe (one row per questions)
@@ -29,7 +29,7 @@ def flat_df_students(df, no_student_columns=NO_STUDENT_COLUMNS):

    Columns of csv files:

-    - NO_STUDENT_COLUMNS
+    - NO_ST_COLUMNS meta data on questions
    - one for each students

    This function flat student's columns to "student" and "score"
@@ -42,14 +42,14 @@ def flat_df_students(df, no_student_columns=NO_STUDENT_COLUMNS):
                df,
                id_vars=no_student_columns,
                value_vars=st,
-                var_name="student",
-                value_name="score",
+                var_name=COLUMNS["student"],
+                value_name=COLUMNS["score"],
            )
        )
-    return pd.concat(scores)
+    return pd.concat(scores).dropna(subset=[COLUMNS["score"]])


-def flat_clear_csv(csv_df, no_student_columns=NO_STUDENT_COLUMNS):
+def flat_clear_csv(csv_df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Flat and clear the dataframe extracted from csv

    :param csv_df: data frame read from csv
@@ -59,12 +59,16 @@ def flat_clear_csv(csv_df, no_student_columns=NO_STUDENT_COLUMNS):
    """
    df = flat_df_students(csv_df)

-    df.columns = df.columns.map(lambda x: x.lower())
+    df[COLUMNS["question"]].fillna("", inplace=True)
+    df[COLUMNS["exercise"]].fillna("", inplace=True)
+    df[COLUMNS["comment"]].fillna("", inplace=True)
+    df[COLUMNS["competence"]].fillna("", inplace=True)

-    df["question"].fillna("", inplace=True)
-    df["exercice"].fillna("", inplace=True)
-    df["commentaire"].fillna("", inplace=True)
-    df["competence"].fillna("", inplace=True)
+    df[COLUMNS["score"]] = pd.to_numeric(
+        df[COLUMNS["score"]]
+        .replace(VALIDSCORE["NOANSWER"], -1)
+        .apply(lambda x: str(x).replace(",", "."))
+    )

    return df

--- a/recopytex/df_marks_manip.py
+++ b/recopytex/df_marks_manip.py
@@ -0,0 +1,205 @@
+#!/usr/bin/env python
+# encoding: utf-8
+
+import pandas as pd
+import numpy as np
+from math import ceil, floor
+from .config import COLUMNS, VALIDSCORE
+
+# Values manipulations
+
+
+def round_half_point(val):
+    """ Round val up to the next multiple of 0.5 """
+    try:
+        half_points = ceil(2.0 * val)
+        return 0.5 * half_points
+    except (ValueError, TypeError):
+        # nan, None or text can not be rounded: hand the value back untouched
+        return val
+
+def score_to_mark(x):
+    """ Compute the mark of one scored item
+
+    A no-answer (score of -1) is marked 0. Otherwise a leveled item (score
+    among 0, 1, 2, 3) is marked score * score_rate / 3 and any other item keeps its score.
+
+    :param x: dictionary with COLUMNS["is_leveled"], COLUMNS["score"] and COLUMNS["score_rate"] keys
+    :return: the mark of the item
+    :raises ValueError: leveled score not in 0, 1, 2, 3 or score above the score_rate
+
+    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
+    ...    COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
+    ...    COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
+    ...    COLUMNS["score"]:[1, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
+    ...    }
+    >>> df = pd.DataFrame(d)
+    >>> score_to_mark(df.loc[0])
+    1.0
+    >>> score_to_mark(df.loc[10])
+    1.3333333333333333
+    """
+    score = x[COLUMNS["score"]]
+    if score == -1:  # -1 is no answer
+        return 0
+    if x[COLUMNS["is_leveled"]]:
+        if score not in [0, 1, 2, 3]:
+            raise ValueError(f"The evaluation is out of range: {score} at {x}")
+        return score * x[COLUMNS["score_rate"]] / 3
+    if score > x[COLUMNS["score_rate"]]:
+        raise ValueError(
+            f"The score ({score}) is greated than the rating scale ({x[COLUMNS['score_rate']]}) at {x}"
+        )
+    return score
+
+
+def score_to_level(x):
+    """ Compute the level: "na" (missing score), -1 (no answer) or 0, 1, 2, 3.
+
+    :param x: dictionary with COLUMNS["is_leveled"], COLUMNS["score"] and COLUMNS["score_rate"] keys
+
+    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
+    ...    COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
+    ...    COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
+    ...    COLUMNS["score"]:[1, 0.33, np.nan, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
+    ...    }
+    >>> df = pd.DataFrame(d)
+    >>> score_to_level(df.loc[0])
+    3
+    >>> score_to_level(df.loc[1])
+    1
+    >>> score_to_level(df.loc[2])
+    'na'
+    >>> score_to_level(df.loc[3])
+    3
+    >>> score_to_level(df.loc[5])
+    3
+    >>> score_to_level(df.loc[10])
+    2
+    """
+    score = x[COLUMNS["score"]]
+    if pd.isna(score):  # ceil and int both fail on a missing score
+        return "na"
+    if score == -1:  # -1 is no answer
+        return score
+    if x[COLUMNS["is_leveled"]]:
+        return int(score)
+    return int(ceil(score / x[COLUMNS["score_rate"]] * 3))
+
+
+# DataFrame columns manipulations
+
+
+def compute_mark(df):
+    """ Compute the mark of every row of df
+
+    :param df: DataFrame with COLUMNS["score"], COLUMNS["is_leveled"] and COLUMNS["score_rate"] columns.
+    :return: the result of score_to_mark applied to each row
+
+    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
+    ...    COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
+    ...    COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
+    ...    COLUMNS["score"]:[1, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
+    ...    }
+    >>> df = pd.DataFrame(d)
+    >>> compute_mark(df)
+    0     1.00
+    1     0.33
+    2     2.00
+    3     1.50
+    4     0.67
+    5     2.00
+    6     0.67
+    7     1.00
+    8     1.50
+    9     1.00
+    10    1.33
+    11    2.00
+    dtype: float64
+    """
+    needed = [COLUMNS["score"], COLUMNS["is_leveled"], COLUMNS["score_rate"]]
+    return df[needed].apply(score_to_mark, axis=1)
+
+
+def compute_level(df):
+    """ Compute the level of every row of df
+
+    :param df: DataFrame with COLUMNS["score"], COLUMNS["is_leveled"] and COLUMNS["score_rate"] columns.
+    :return: the result of score_to_level applied to each row
+
+    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
+    ...    COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
+    ...    COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
+    ...    COLUMNS["score"]:[np.nan, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
+    ...    }
+    >>> df = pd.DataFrame(d)
+    >>> compute_level(df)
+    0     na
+    1      1
+    2      3
+    3      3
+    4      1
+    5      3
+    6      2
+    7      3
+    8      3
+    9      2
+    10     2
+    11     3
+    dtype: object
+    """
+    needed = [COLUMNS["score"], COLUMNS["is_leveled"], COLUMNS["score_rate"]]
+    return df[needed].apply(score_to_level, axis=1)
+
+
+def compute_normalized(df):
+    """ Compute the normalized mark (mark / score_rate) of every row of df
+
+    :param df: DataFrame with COLUMNS["mark"] and COLUMNS["score_rate"] columns
+
+    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
+    ...    COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
+    ...    COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
+    ...    COLUMNS["score"]:[1, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
+    ...    }
+    >>> df = pd.DataFrame(d).assign(**{COLUMNS["mark"]: compute_mark})
+    >>> compute_normalized(df)
+    0     1.00
+    1     0.33
+    2     1.00
+    3     0.75
+    4     0.33
+    5     1.00
+    6     0.67
+    7     1.00
+    8     0.75
+    9     0.50
+    10    0.67
+    11    1.00
+    dtype: float64
+    """
+    marks, score_rates = df[COLUMNS["mark"]], df[COLUMNS["score_rate"]]
+    return marks / score_rates
+
+
+# Postprocessing question scores
+
+
+def pp_q_scores(df):
+    """ Postprocess a questions-scores dataframe
+
+    Adds the mark, level and normalized columns computed from the scores.
+
+    :param df: questions-scores dataframe
+    :return: new data frame: df plus the mark, level and normalized columns
+    """
+    # Mark goes first: compute_normalized reads the mark column
+    marked = df.assign(**{COLUMNS["mark"]: compute_mark})
+    leveled = marked.assign(**{COLUMNS["level"]: compute_level})
+    return leveled.assign(**{COLUMNS["normalized"]: compute_normalized})
+
+
+# -----------------------------
+# Reglages pour 'vim'
+# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
+# cursor: 16 del
--- a/templates/tpl_evaluation.ipynb
+++ b/templates/tpl_evaluation.ipynb