repytex/notes_tools/tools/df_marks_manip.py

#!/usr/bin/env python
# encoding: utf-8

import pandas as pd
import numpy as np
from math import ceil
import logging
logger = logging.getLogger(__name__)

# Values manipulations

def round_half_point(val):
    """ Round a value up to the next multiple of 0.5

    Despite its name this is a ceiling operation: ``1.33`` gives ``1.5``
    and ``1.5`` stays ``1.5``.

    :param val: the value to round
    :returns: val rounded up to the next half point, or val itself
        (unchanged) when it can't be rounded (NaN, infinity, None, str...)

    >>> round_half_point(1.33)
    1.5
    >>> round_half_point(2)
    2.0
    >>> round_half_point(None) is None
    True
    >>> round_half_point(float("inf"))
    inf
    """
    try:
        return 0.5 * ceil(2.0 * val)
    except (ValueError, OverflowError, TypeError):
        # ceil raises ValueError on NaN and OverflowError on +/-inf;
        # TypeError covers values which are not numbers (None, str...).
        return val

# LaTeX commands used to display a "Niveau" note: the first one stands for a
# missing note, the following ones for the notes 0, 1, 2 and 3.
latex_caract = [
    "\\NoRep",
    "\\RepZ",
    "\\RepU",
    "\\RepD",
    "\\RepT",
]


def note_to_rep(x):
    r""" Give the representation of a note (LaTeX command or the note itself)

    When "Niveau" is truthy, a null note gives the first command of
    latex_caract and a note equal to 0, 1, 2 or 3 gives the command at
    position note + 1. In every other case the note is returned unchanged.

    :param x: dictionary (or DataFrame row) with "Niveau" and "Note" keys
    :returns: a LaTeX command (str) or the value stored under "Note"

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    "Nom": ["N1"]*4+["N2"]*2 + ["N1"]*4+["N2"]*2,
    ...    "Exercice":["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"] + ["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"],
    ...    "Question":["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"] + ["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"],
    ...    "Date":["16/09/2016"]*4+["01/10/2016"]*2 + ["16/09/2016"]*4+["01/10/2016"]*2,
    ...    "Trimestre": ["1"]*12,
    ...    "Bareme":[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    "Niveau":[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    "Note":[1, 0.33, 2, 1.5, 1, 3,   0.67, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> note_to_rep(df.loc[0])
    1.0
    >>> note_to_rep(df.loc[4])
    '\\RepU'
    """
    # Classic mark: nothing to translate
    if not x["Niveau"]:
        return x["Note"]

    note = x["Note"]
    if pd.isnull(note):
        return latex_caract[0]
    if note in range(4):
        return latex_caract[int(note) + 1]
    # "Niveau" note outside 0..3: left as it is
    return note

def note_to_mark(x):
    """ Translate a note into a mark

    A "Niveau" note is scaled to the "Bareme" (Note * Bareme / 3), any
    other note already is the mark and is returned unchanged.

    :param x: dictionary (or DataFrame row) with "Niveau", "Note" and "Bareme" keys
    :returns: the mark

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    "Nom": ["N1"]*4+["N2"]*2 + ["N1"]*4+["N2"]*2,
    ...    "Exercice":["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"] + ["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"],
    ...    "Question":["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"] + ["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"],
    ...    "Date":["16/09/2016"]*4+["01/10/2016"]*2 + ["16/09/2016"]*4+["01/10/2016"]*2,
    ...    "Trimestre": ["1"]*12,
    ...    "Bareme":[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    "Niveau":[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    "Note":[1, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> note_to_mark(df.loc[0])
    1.0
    >>> note_to_mark(df.loc[10])
    1.3333333333333333
    """
    if not x["Niveau"]:
        return x["Note"]
    # "Niveau" notes go from 0 to 3: bring them back to the scale
    return x["Note"] * x["Bareme"] / 3

def question_uniq_formater(row):
    """ Create a kind of unique description of the question

    The description is built as "<exercice> <question>[ (<commentaire>)]":

    - an "Exercice" which can be converted to an integer is prefixed with
      "Exo", any other value is written with str()
    - a "Question" which can be converted to an integer is prefixed with
      "Qu", a null one (NaN or None) is skipped, any other value is
      written with str()
    - "Commentaire" is added between parenthesis when the key exists and
      its value is not null

    :param row: dictionary (or DataFrame row) with "Exercice" and "Question"
        keys and an optional "Commentaire" key
    :returns: the description (str)

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    "Nom": ["N1"]*4+["N2"]*2 + ["N1"]*4+["N2"]*2,
    ...    "Exercice":["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"] + ["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"],
    ...    "Question":["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"] + ["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"],
    ...    "Date":["16/09/2016"]*4+["01/10/2016"]*2 + ["16/09/2016"]*4+["01/10/2016"]*2,
    ...    "Trimestre": ["1"]*12,
    ...    "Bareme":[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    "Niveau":[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    "Note":[1, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> question_uniq_formater(df.loc[0])
    'Ex1 Q1'
    >>> question_uniq_formater(df.loc[10])
    'Ex1 Q1'

    """
    exercice = row["Exercice"]
    try:
        int(exercice)
    except (ValueError, TypeError):
        # ValueError: text or NaN, TypeError: None (or any non number)
        ans = str(exercice)
    else:
        ans = "Exo" + str(exercice)

    ans += " "

    question = row["Question"]
    try:
        int(question)
    except (ValueError, TypeError):
        # int(None) raises TypeError where int(nan) raises ValueError:
        # both are missing questions and have to be skipped
        if not pd.isnull(question):
            ans += str(question)
    else:
        ans += "Qu" + str(question)

    try:
        comment = row["Commentaire"]
    except KeyError:
        # No comment column: nothing to add
        pass
    else:
        if not pd.isnull(comment):
            ans += " ({})".format(comment)
    return ans

# DataFrame columns manipulations

def compute_marks(df):
    """ Compute the mark of each row of df

    note_to_mark is applied row by row. The result is returned, df is not
    modified.

    :param df: DataFrame with "Note", "Niveau" and "Bareme" columns.
    :returns: the marks, one per row of df

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    "Nom": ["N1"]*4+["N2"]*2 + ["N1"]*4+["N2"]*2,
    ...    "Exercice":["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"] + ["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"],
    ...    "Question":["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"] + ["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"],
    ...    "Date":["16/09/2016"]*4+["01/10/2016"]*2 + ["16/09/2016"]*4+["01/10/2016"]*2,
    ...    "Trimestre": ["1"]*12,
    ...    "Bareme":[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    "Niveau":[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    "Note":[1, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> compute_marks(df)
    0     1.00
    1     0.33
    2     2.00
    3     1.50
    4     0.67
    5     2.00
    6     0.67
    7     1.00
    8     1.50
    9     1.00
    10    1.33
    11    2.00
    dtype: float64
    """
    # Only those columns are read by note_to_mark
    needed = df[["Note", "Niveau", "Bareme"]]
    marks = needed.apply(note_to_mark, axis=1)
    return marks

def compute_latex_rep(df):
    r""" Compute the representation of the note of each row of df

    note_to_rep is applied row by row, then remaining null values are
    replaced with "??". The result is returned, df is not modified.

    :param df: DataFrame with "Note" and "Niveau" columns.
    :returns: the representations, one per row of df

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    "Nom": ["N1"]*4+["N2"]*2 + ["N1"]*4+["N2"]*2,
    ...    "Exercice":["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"] + ["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"],
    ...    "Question":["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"] + ["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"],
    ...    "Date":["16/09/2016"]*4+["01/10/2016"]*2 + ["16/09/2016"]*4+["01/10/2016"]*2,
    ...    "Trimestre": ["1"]*12,
    ...    "Bareme":[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    "Niveau":[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    "Note":[1, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> compute_latex_rep(df)
    0         1
    1      0.33
    2         2
    3       1.5
    4     \RepU
    5     \RepT
    6      0.67
    7         1
    8       1.5
    9         1
    10    \RepD
    11    \RepT
    dtype: object
    """
    # Only those columns are read by note_to_rep
    needed = df[["Note", "Niveau"]]
    representations = needed.apply(note_to_rep, axis=1)
    # Null values left by note_to_rep get a visible placeholder
    return representations.fillna("??")

def compute_normalized(df):
    """ Compute the normalized marks: "Mark" divided by "Bareme"

    The result is returned, df is not modified.

    :param df: DataFrame with "Mark" and "Bareme" columns
    :returns: the element-wise quotient Mark / Bareme

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    "Nom": ["N1"]*4+["N2"]*2 + ["N1"]*4+["N2"]*2,
    ...    "Exercice":["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"] + ["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"],
    ...    "Question":["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"] + ["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"],
    ...    "Date":["16/09/2016"]*4+["01/10/2016"]*2 + ["16/09/2016"]*4+["01/10/2016"]*2,
    ...    "Trimestre": ["1"]*12,
    ...    "Bareme":[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    "Niveau":[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    "Note":[1, 0.33, 2, 1.5, 1, 3,   0.666, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> df["Mark"] = compute_marks(df)
    >>> compute_normalized(df)
    0     1.00
    1     0.33
    2     1.00
    3     0.75
    4     0.33
    5     1.00
    6     0.67
    7     1.00
    8     0.75
    9     0.50
    10    0.67
    11    1.00
    dtype: float64
    """
    marks = df["Mark"]
    scales = df["Bareme"]
    return marks / scales

def compute_question_description(df):
    """ Compute the unique description of the question of each row of df

    :param df: DataFrame whose rows are given to question_uniq_formater
    :returns: the descriptions, one per row of df
    """
    descriptions = df.apply(question_uniq_formater, axis=1)
    return descriptions

# Computing custom values

def compute_exo_marks(df):
    """ Compute Exercice level marks

    "Bareme" and "Mark" are summed for each (Eleve, Nom, Exercice, Date,
    Trimestre) group then rounded with round_half_point. Null values are
    ignored by the sum, but a group with nothing but null values gets NaN
    (not 0): a missing exercice must not be counted as a failed one.

    :param df: the original marks, with "Eleve", "Nom", "Exercice", "Date",
        "Trimestre", "Bareme" and "Mark" columns
    :returns: DataFrame with computed marks, one row per group, with
        "Question" set to "Total" and "Niveau" set to 0

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    "Nom": ["N1"]*4+["N2"]*2 + ["N1"]*4+["N2"]*2,
    ...    "Exercice":["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"] + ["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"],
    ...    "Question":["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"] + ["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"],
    ...    "Date":["16/09/2016"]*4+["01/10/2016"]*2 + ["16/09/2016"]*4+["01/10/2016"]*2,
    ...    "Trimestre": ["1"]*12,
    ...    "Bareme":[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    "Niveau":[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    "Note":[1, 0.33, 2, 1.5, 1, 3,   0.67, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> df["Mark"] = compute_marks(df)
    >>> compute_exo_marks(df)
      Eleve Nom Exercice        Date Trimestre  Bareme  Mark Question  Niveau
    0    E1  N1      Ex1  16/09/2016         1     2.0   1.5    Total       0
    1    E1  N1      Ex2  16/09/2016         1     4.0   3.5    Total       0
    2    E1  N2      Ex1  01/10/2016         1     2.0   1.0    Total       0
    3    E1  N2      Ex2  01/10/2016         1     2.0   2.0    Total       0
    4    E2  N1      Ex1  16/09/2016         1     2.0   2.0    Total       0
    5    E2  N1      Ex2  16/09/2016         1     4.0   2.5    Total       0
    6    E2  N2      Ex1  01/10/2016         1     2.0   1.5    Total       0
    7    E2  N2      Ex2  01/10/2016         1     2.0   2.0    Total       0

    """
    def sum_or_nan(values):
        # min_count=1: the sum of nothing but null values is NaN instead of 0
        return values.sum(min_count=1)

    exo_pt = pd.pivot_table(df,
              index = [ "Eleve", "Nom", "Exercice", "Date", "Trimestre"],
              values = ["Bareme", "Mark"],
              aggfunc=sum_or_nan,
              )
    # Element-wise rounding, column by column (DataFrame.applymap is
    # deprecated in recent pandas)
    exo_pt = exo_pt.apply(lambda column: column.map(round_half_point))

    exo = exo_pt.reset_index()
    exo["Question"] = "Total"
    exo["Niveau"] = 0
    return exo

def compute_eval_marks(df):
    """ Compute Nom level marks from the dataframe using only row with Total in Question

    For each "Nom", "Bareme" and "Mark" are summed by (Eleve, Nom,
    Trimestre). When the rows of a "Nom" have several dates (or the date
    "Trimestre"), rows with a null "Mark" are dropped before the sum and
    the date of the result is "Trimestre". Null marks are ignored by the
    sum, but the sum of nothing but null marks is NaN (not 0).

    :param df: DataFrame with value Total in Question column, and "Eleve",
        "Nom", "Trimestre", "Date", "Bareme" and "Mark" columns
    :returns: DataFrame with evaluation marks. The "index" column holds the
        position of the row inside its own evaluation.

    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    "Nom": ["N1"]*4+["N2"]*2 + ["N1"]*4+["N2"]*2,
    ...    "Exercice":["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"] + ["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"],
    ...    "Question":["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"] + ["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"],
    ...    "Date":["16/09/2016"]*4+["01/10/2016"]*2 + ["16/09/2016"]*4+["01/10/2016"]*2,
    ...    "Trimestre": ["1"]*12,
    ...    "Bareme":[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    "Niveau":[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    "Note":[1, 0.33, 2, 1.5, 1, 3,   0.67, 1, 1.5, 1, 2, 3],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> df["Mark"] = compute_marks(df)
    >>> df_exo = compute_exo_marks(df)
    >>> compute_eval_marks(df_exo)
       index Eleve Nom Trimestre  Bareme        Date  Mark
    0      0    E1  N1         1     6.0  16/09/2016   5.0
    1      1    E2  N1         1     6.0  16/09/2016   4.5
    2      0    E1  N2         1     4.0  01/10/2016   3.0
    3      1    E2  N2         1     4.0  01/10/2016   3.5

    """
    def sum_or_nan(values):
        # min_count=1: the sum of nothing but null values is NaN instead of 0
        return values.sum(min_count=1)

    eval_frames = []
    # Rows without "Nom" can't be attached to any evaluation
    for eval_name in df["Nom"].dropna().unique():
        logger.debug(f"Compute marks for {eval_name}")
        eval_df = df[df["Nom"] == eval_name]
        dates = eval_df["Date"].unique()
        logger.debug(f"Find those dates: {dates}")
        if len(dates) > 1 or dates[0] == "Trimestre":
            # Long-running assessments: null marks are not penalising,
            # they are removed
            eval_df = eval_df.dropna(subset=["Mark"])
            dates = ["Trimestre"]

        # The lambda is called inside this iteration: it reads the dates
        # of the current evaluation
        eval_pt = pd.pivot_table(eval_df,
                index = [ "Eleve", "Nom", "Trimestre"],
                values = ["Bareme", "Mark", "Date"],
                aggfunc={"Bareme": sum_or_nan, "Mark": sum_or_nan, "Date":lambda x:dates[0]},
                )
        eval_frames.append(eval_pt.reset_index())

    if not eval_frames:
        # pd.concat refuses an empty list
        return pd.DataFrame().reset_index()

    # A single concatenation rather than one per evaluation
    return pd.concat(eval_frames).reset_index()

def digest_flat_df(flat_df):
    r""" Build the question, exercice and evaluation level DataFrames from a flat df

    Rows whose "Note" is "nn" are removed, then "Mark", "Latex_rep",
    "Normalized" and "Uniq_quest" columns are computed. Exercice marks are
    computed from those rows and evaluation marks from the exercice marks,
    both with their own "Normalized" column. flat_df is not modified.

    :param flat_df: DataFrame with one row per (Eleve, Nom, Exercice, Question)
    :returns: tuple (question level df, exercice level df, evaluation level df)

    >>> from numpy import nan
    >>> d = {"Eleve":["E1"]*6 + ["E2"]*6,
    ...    "Nom": ["N1"]*4+["N2"]*2 + ["N1"]*4+["N2"]*2,
    ...    "Exercice":["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"] + ["Ex1"]*2+["Ex2"]*2+["Ex1"]+["Ex2"],
    ...    "Question":["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"] + ["Q1"]+["Q2"]+["Q1"]+["Q2"]+["Q1"]+["Q1"],
    ...    "Date":["16/09/2016"]*4+["01/10/2016"]*2 + ["16/09/2016"]*4+["01/10/2016"]*2,
    ...    "Trimestre": ["1"]*12,
    ...    "Bareme":[1]*2+[2]*2+[2]*2 + [1]*2+[2]*2+[2]*2,
    ...    "Niveau":[0]*4+[1]*2 + [0]*4+[1]*2,
    ...    "Note":[1, 0.33, 2, 1.5, 1, 3,   nan, 0, 0, nan, nan, nan],
    ...    }
    >>> df = pd.DataFrame(d)
    >>> quest_df, exo_df, eval_df = digest_flat_df(df)
    >>> quest_df[['Eleve', "Nom", "Mark", "Latex_rep", "Normalized", "Uniq_quest"]]
       Eleve Nom  Mark Latex_rep  Normalized Uniq_quest
    0     E1  N1  1.00         1        1.00     Ex1 Q1
    1     E1  N1  0.33      0.33        0.33     Ex1 Q2
    2     E1  N1  2.00         2        1.00     Ex2 Q1
    3     E1  N1  1.50       1.5        0.75     Ex2 Q2
    4     E1  N2  0.67     \RepU        0.33     Ex1 Q1
    5     E1  N2  2.00     \RepT        1.00     Ex2 Q1
    6     E2  N1   NaN        ??         NaN     Ex1 Q1
    7     E2  N1  0.00         0        0.00     Ex1 Q2
    8     E2  N1  0.00         0        0.00     Ex2 Q1
    9     E2  N1   NaN        ??         NaN     Ex2 Q2
    10    E2  N2   NaN    \NoRep         NaN     Ex1 Q1
    11    E2  N2   NaN    \NoRep         NaN     Ex2 Q1
    >>> exo_df[['Eleve', "Nom", "Exercice", "Mark", "Normalized"]]
      Eleve Nom Exercice  Mark  Normalized
    0    E1  N1      Ex1   1.5        0.75
    1    E1  N1      Ex2   3.5        0.88
    2    E1  N2      Ex1   1.0        0.50
    3    E1  N2      Ex2   2.0        1.00
    4    E2  N1      Ex1   0.0        0.00
    5    E2  N1      Ex2   0.0        0.00
    6    E2  N2      Ex1   NaN         NaN
    7    E2  N2      Ex2   NaN         NaN
    >>> eval_df
       index Eleve Nom Trimestre  Bareme        Date  Mark  Normalized
    0      0    E1  N1         1     6.0  16/09/2016   5.0        0.83
    1      1    E2  N1         1     6.0  16/09/2016   0.0        0.00
    2      0    E1  N2         1     4.0  01/10/2016   3.0        0.75
    3      1    E2  N2         1     4.0  01/10/2016   NaN         NaN
    """
    # Question level: rows noted "nn" (not marked) are left out and the
    # kept rows are copied, so that the caller's DataFrame stays untouched
    is_marked = flat_df["Note"] != "nn"
    quest_df = flat_df[is_marked].copy()
    quest_df["Mark"] = compute_marks(quest_df)
    quest_df["Latex_rep"] = compute_latex_rep(quest_df)
    # "Normalized" needs the "Mark" column computed above
    quest_df["Normalized"] = compute_normalized(quest_df)
    quest_df["Uniq_quest"] = compute_question_description(quest_df)

    # Exercice level, built from the questions
    exo_df = compute_exo_marks(quest_df)
    exo_df["Normalized"] = compute_normalized(exo_df)

    # Evaluation level, built from the exercices
    eval_df = compute_eval_marks(exo_df)
    eval_df["Normalized"] = compute_normalized(eval_df)

    return quest_df, exo_df, eval_df


# -----------------------------
# Reglages pour 'vim'
# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
# cursor: 16 del