Feat: first analysis and plots

2019-08-04 23:24:32 +02:00 · 2019-08-04 23:24:32 +02:00 · 358af2aabf
commit 358af2aabf
parent 2296615cb4
5 changed files with 561 additions and 125 deletions
--- a/recopytex/init.py
+++ b/recopytex/init.py
@ -1,15 +1,5 @@
 #!/usr/bin/env python
 # encoding: utf-8
-NO_STUDENT_COLUMNS = [
+from .csv_extraction import flat_clear_csv
-    "Trimestre",
+from .df_marks_manip import pp_q_scores
    "Nom",
    "Date",
    "Exercice",
    "Question",
    "Competence",
    "Domaine",
    "Commentaire",
    "Bareme",
    "Niveau",
 ]
--- a/recopytex/config.py
+++ b/recopytex/config.py
@ -0,0 +1,30 @@
 #!/usr/bin/env python
 # encoding: utf-8
 # Columns of the csv file that describe a question (every column that is not
 # a student column). Maps the internal key to the header used in the csv.
 # The values are used as the default "no_student_columns" of the csv
 # extraction functions.
 NO_ST_COLUMNS = {
    "term": "Trimestre",
    "assessment": "Nom",
    "date": "Date",
    "exercise": "Exercice",
    "question": "Question",
    "competence": "Competence",
    "theme": "Domaine",
    "comment": "Commentaire",
    "score_rate": "Bareme",
    "is_leveled": "Est_nivele",
 }
 # Every column name used once the csv is flattened: the question columns of
 # NO_ST_COLUMNS plus the student and score columns, and the mark, level and
 # normalized columns computed from the scores.
 COLUMNS = {
    **NO_ST_COLUMNS,
    "student": "Eleve",
    "score": "Score",
    "mark": "Note",
    "level": "Niveau",
    "normalized": "Normalise",
 }
 # Special (non numeric) values that can be found in a score cell
 VALIDSCORE = {
    "NOTFILLED": "",  # The item is not scored yet
    "NOANSWER": ".",  # Student gave no answer (this score impacts the final mark)
    "ABS": "a",  # Student was absent (this score does not impact the final mark)
 }
--- a/recopytex/csv_extraction.py
+++ b/recopytex/csv_extraction.py
@ -4,12 +4,12 @@
 """ Extracting data from xlsx files """
 import pandas as pd
-from . import NO_STUDENT_COLUMNS
+from .config import NO_ST_COLUMNS, COLUMNS, VALIDSCORE
 pd.set_option("Precision", 2)
-def extract_students(df, no_student_columns=NO_STUDENT_COLUMNS):
+def extract_students(df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Extract the list of students from df 
    :param df: the dataframe
@ -20,7 +20,7 @@ def extract_students(df, no_student_columns=NO_STUDENT_COLUMNS):
    return students
-def flat_df_students(df, no_student_columns=NO_STUDENT_COLUMNS):
+def flat_df_students(df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Flat the ws for students
    :param df: the dataframe (one row per questions)
@ -29,7 +29,7 @@ def flat_df_students(df, no_student_columns=NO_STUDENT_COLUMNS):
    Columns of csv files:
-    - NO_STUDENT_COLUMNS
+    - NO_ST_COLUMNS meta data on questions
    - one for each students
    This function flat student's columns to "student" and "score"
@ -42,14 +42,14 @@ def flat_df_students(df, no_student_columns=NO_STUDENT_COLUMNS):
                df,
                id_vars=no_student_columns,
                value_vars=st,
-                var_name="student",
+                var_name=COLUMNS["student"],
-                value_name="score",
+                value_name=COLUMNS["score"],
            )
        )
-    return pd.concat(scores)
+    return pd.concat(scores).dropna(subset=[COLUMNS["score"]])
-def flat_clear_csv(csv_df, no_student_columns=NO_STUDENT_COLUMNS):
+def flat_clear_csv(csv_df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Flat and clear the dataframe extracted from csv
    :param csv_df: data frame read from csv
@ -59,12 +59,16 @@ def flat_clear_csv(csv_df, no_student_columns=NO_STUDENT_COLUMNS):
    """
    df = flat_df_students(csv_df)
-    df.columns = df.columns.map(lambda x: x.lower())
+    df[COLUMNS["question"]].fillna("", inplace=True)
    df[COLUMNS["exercise"]].fillna("", inplace=True)
    df[COLUMNS["comment"]].fillna("", inplace=True)
    df[COLUMNS["competence"]].fillna("", inplace=True)
-    df["question"].fillna("", inplace=True)
+    df[COLUMNS["score"]] = pd.to_numeric(
-    df["exercice"].fillna("", inplace=True)
+        df[COLUMNS["score"]]
-    df["commentaire"].fillna("", inplace=True)
+        .replace(VALIDSCORE["NOANSWER"], -1)
-    df["competence"].fillna("", inplace=True)
+        .apply(lambda x: str(x).replace(",", "."))
    )
    return df
--- a/recopytex/df_marks_manip.py
+++ b/recopytex/df_marks_manip.py
@ -0,0 +1,205 @@
 #!/usr/bin/env python
 # encoding: utf-8
 import pandas as pd
 import numpy as np
 from math import ceil, floor
 from .config import COLUMNS, VALIDSCORE
 # Values manipulations
 def round_half_point(val):
    """ Round a value up to the next half point

    :param val: the value to round
    :return: the smallest multiple of 0.5 greater than or equal to val,
        or val unchanged when it can not be rounded (NaN or not a number)

    >>> round_half_point(1.2)
    1.5
    >>> round_half_point(2)
    2.0
    >>> round_half_point("a")
    'a'
    """
    try:
        doubled = ceil(2.0 * val)
    except (ValueError, TypeError):
        # ValueError: NaN can not be ceiled; TypeError: val is not a number
        return val
    return 0.5 * doubled
 def score_to_mark(x):
    """ Compute the mark of one scored item

    If the item is leveled, the score (a level among 0, 1, 2, 3) is scaled to
    the score_rate (score * score_rate / 3), otherwise the score is the mark.
    A "no answer" score (-1) gives a mark of 0.

    :param x: dictionnary with COLUMNS["is_leveled"], COLUMNS["score"] and COLUMNS["score_rate"] keys
    :return: the mark of the item
    :raises ValueError: if a leveled score is not 0, 1, 2 or 3, or if a non
        leveled score is greater than the score_rate

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    COLUMNS["score"]:[1, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> score_to_mark(df.loc[0])
    1.0
    >>> score_to_mark(df.loc[10])
    1.3333333333333333
    """
    score = x[COLUMNS["score"]]
    # -1 is no answer
    if score == -1:
        return 0

    if x[COLUMNS["is_leveled"]]:
        if score not in [0, 1, 2, 3]:
            raise ValueError(f"The evaluation is out of range: {score} at {x}")
        return score * x[COLUMNS["score_rate"]] / 3

    score_rate = x[COLUMNS["score_rate"]]
    if score > score_rate:
        # The score is read through COLUMNS: a literal "score" key does not
        # exist and would turn this ValueError into a KeyError
        raise ValueError(
            f"The score ({score}) is greated than the rating scale ({score_rate}) at {x}"
        )
    return score
 def score_to_level(x):
    """ Compute the level of one scored item

    - a missing score (NaN or None) gives "na"
    - a "no answer" score (-1) is returned as it is
    - a leveled score is already a level, it is converted to int
    - otherwise the level is score / score_rate * 3 rounded up

    :param x: dictionnary with COLUMNS["is_leveled"], COLUMNS["score"] and COLUMNS["score_rate"] keys
    :return: "na", -1 or the level as an int

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    COLUMNS["score"]:[1, 0.33, np.nan, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> score_to_level(df.loc[0])
    3
    >>> score_to_level(df.loc[1])
    1
    >>> score_to_level(df.loc[2])
    'na'
    >>> score_to_level(df.loc[3])
    3
    >>> score_to_level(df.loc[5])
    3
    >>> score_to_level(df.loc[10])
    2
    """
    score = x[COLUMNS["score"]]
    # A missing score has no level (int() and ceil() both fail on NaN)
    if pd.isnull(score):
        return "na"
    # -1 is no answer
    if score == -1:
        return score

    if x[COLUMNS["is_leveled"]]:
        return int(score)
    return int(ceil(score / x[COLUMNS["score_rate"]] * 3))
 # DataFrame columns manipulations
 def compute_mark(df):
    """ Compute the mark of every row of df

    score_to_mark is applied row by row on the score, is_leveled and
    score_rate columns.

    :param df: DataFrame with COLUMNS["score"], COLUMNS["is_leveled"] and COLUMNS["score_rate"] columns.
    :return: the marks, one per row of df

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    COLUMNS["score"]:[1, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> compute_mark(df)
    0     1.00
    1     0.33
    2     2.00
    3     1.50
    4     0.67
    5     2.00
    6     0.67
    7     1.00
    8     1.50
    9     1.00
    10    1.33
    11    2.00
    dtype: float64
    """
    needed_columns = [
        COLUMNS["score"],
        COLUMNS["is_leveled"],
        COLUMNS["score_rate"],
    ]
    scored_items = df[needed_columns]
    return scored_items.apply(score_to_mark, axis=1)
 def compute_level(df):
    """ Compute the level of every row of df

    score_to_level is applied row by row on the score, is_leveled and
    score_rate columns.

    NOTE(review): the 'na' of row 0 below assumes that score_to_level maps a
    missing score to 'na' -- to be confirmed against score_to_level.

    :param df: DataFrame with COLUMNS["score"], COLUMNS["is_leveled"] and COLUMNS["score_rate"] columns.
    :return: the levels, one per row of df

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    COLUMNS["score"]:[np.nan, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> compute_level(df)
    0     na
    1      1
    2      3
    3      3
    4      1
    5      3
    6      2
    7      3
    8      3
    9      2
    10     2
    11     3
    dtype: object
    """
    needed_columns = [
        COLUMNS["score"],
        COLUMNS["is_leveled"],
        COLUMNS["score_rate"],
    ]
    scored_items = df[needed_columns]
    return scored_items.apply(score_to_level, axis=1)
 def compute_normalized(df):
    """ Compute the normalized mark (mark / score_rate)

    :param df: DataFrame with COLUMNS["mark"] and COLUMNS["score_rate"] columns
    :return: the mark of each row divided by its score_rate

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    COLUMNS["score_rate"]:[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    COLUMNS["is_leveled"]:[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    COLUMNS["score"]:[1, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> df[COLUMNS["mark"]] = compute_mark(df)
    >>> compute_normalized(df)
    0     1.00
    1     0.33
    2     1.00
    3     0.75
    4     0.33
    5     1.00
    6     0.67
    7     1.00
    8     0.75
    9     0.50
    10    0.67
    11    1.00
    dtype: float64
    """
    marks = df[COLUMNS["mark"]]
    score_rates = df[COLUMNS["score_rate"]]
    return marks / score_rates
 # Postprocessing question scores
 def pp_q_scores(df):
    """ Postprocess the questions-scores dataframe

    The mark, level and normalized columns are computed with compute_mark,
    compute_level and compute_normalized.

    NOTE(review): compute_normalized reads the mark column, so this relies on
    DataFrame.assign evaluating the callables in the order given here --
    confirm for the pandas version in use.

    :param df: questions-scores dataframe
    :return: same data frame with mark, level and normalize columns
    """
    new_columns = dict(
        zip(
            (COLUMNS["mark"], COLUMNS["level"], COLUMNS["normalized"]),
            (compute_mark, compute_level, compute_normalized),
        )
    )
    return df.assign(**new_columns)
 # -----------------------------
 # Reglages pour 'vim'
 # vim:set autoindent expandtab tabstop=4 shiftwidth=4:
 # cursor: 16 del
--- a/templates/tpl_evaluation.ipynb
+++ b/templates/tpl_evaluation.ipynb