# recopytex/recopytex/csv_extraction.py

#!/usr/bin/env python
# encoding: utf-8

""" Extracting data from xlsx files """

import pandas as pd
from .config import NO_ST_COLUMNS, COLUMNS, VALIDSCORE

# Use the fully qualified option name: option patterns are matched as
# case-insensitive regexes, and a bare "precision" is ambiguous on pandas
# versions that also define "styler.format.precision", which makes
# set_option raise OptionError when this module is imported.
pd.set_option("display.precision", 2)

def try_replace(x, old, new):
    """ Replace old by new in the string form of x

    :param x: value to convert (it is passed through str first)
    :param old: substring to look for
    :param new: substring to put in place of old
    :return: the converted string, or x untouched when the conversion
        raises a ValueError
    """
    try:
        text = str(x)
        replaced = text.replace(old, new)
    except ValueError:
        return x
    return replaced


def extract_students(df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Extract the students from the columns of df

    Every column label of df that is not listed in no_student_columns is
    considered to be a student.

    :param df: the dataframe
    :param no_student_columns: labels of the columns that are not students
    :return: the remaining column labels, as returned by the pandas
        Index.difference method (an Index, not a plain list)
    """
    return df.columns.difference(no_student_columns)


def flat_df_students(df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Flatten the student columns of df

    :param df: the dataframe (one row per question)
    :param no_student_columns: columns that are not students
    :return: dataframe with one row per question and student

    Columns of csv files:

    - NO_ST_COLUMNS meta data on questions
    - one for each student

    Each student column is melted into two columns, COLUMNS["student"]
    (the former column label) and COLUMNS["score"] (the cell value), and
    the rows without a score are dropped. The per-student frames are then
    concatenated, each keeping the row index of df.
    """

    def scores_of(student):
        # Long format for a single student column, keeping the question
        # meta data as identifier columns.
        melted = pd.melt(
            df,
            id_vars=no_student_columns,
            value_vars=student,
            var_name=COLUMNS["student"],
            value_name=COLUMNS["score"],
        )
        # A missing score means the student has no entry for this question.
        return melted.dropna(subset=[COLUMNS["score"]])

    return pd.concat(
        [scores_of(name) for name in extract_students(df, no_student_columns)]
    )


def flat_clear_csv(csv_df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Flat and clear the dataframe extracted from csv

    :param csv_df: data frame read from csv
    :param no_student_columns: columns that are not students
    :return: dataframe with one row per questions and students

    Cleaning applied after flattening:

    - missing question, exercise, comment and competence values become ""
    - scores equal to VALIDSCORE["NOANSWER"] become -1, "," is replaced by
      "." and the column is converted to numbers (a score that cannot be
      parsed makes pd.to_numeric raise)
    - score rates get the same "," to "." replacement, values that cannot
      be parsed become NaN
    """
    # Forward the caller's column selection; without it a custom
    # no_student_columns would be silently ignored.
    df = flat_df_students(csv_df, no_student_columns)

    # Assign the filled column back to the dataframe: fillna(inplace=True)
    # on a column selection is a chained assignment that pandas does not
    # guarantee to write through to df.
    for key in ("question", "exercise", "comment", "competence"):
        column = COLUMNS[key]
        df[column] = df[column].fillna("")

    df[COLUMNS["score"]] = pd.to_numeric(
        df[COLUMNS["score"]]
        .replace(VALIDSCORE["NOANSWER"], -1)
        .apply(lambda x: try_replace(x, ",", "."))
    )
    df[COLUMNS["score_rate"]] = pd.to_numeric(
        df[COLUMNS["score_rate"]]
        .apply(lambda x: try_replace(x, ",", ".")),
        errors="coerce"
    )

    return df


# -----------------------------
# Settings for 'vim'
# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
# cursor: 16 del