recopytex/recopytex/csv_extraction.py

#!/usr/bin/env python
# encoding: utf-8

""" Extracting data from xlsx files """

import pandas as pd
from .config import NO_ST_COLUMNS, COLUMNS, VALIDSCORE

pd.set_option("Precision", 2)


def try_replace(x, old, new):
    try:
        return str(x).replace(old, new)
    except ValueError:
        return x


def extract_students(df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Extract the list of students from df 

    :param df: the dataframe
    :param no_student_columns: columns that are not students
    :return: list of students
    """
    students = df.columns.difference(no_student_columns)
    return students


def flat_df_students(
    df, no_student_columns=NO_ST_COLUMNS.values(), postprocessing=True
):
    """ Flat the dataframe by returning a dataframe with on student on each line

    :param df: the dataframe (one row per questions)
    :param no_student_columns: columns that are not students
    :return: dataframe with one row per questions and students

    Columns of csv files:

    - NO_ST_COLUMNS meta data on questions
    - one for each students

    This function flat student's columns to "student" and "score"
    """
    students = extract_students(df, no_student_columns)
    scores = []
    for st in students:
        scores.append(
            pd.melt(
                df,
                id_vars=no_student_columns,
                value_vars=st,
                var_name=COLUMNS["student"],
                value_name=COLUMNS["score"],
            ).dropna(subset=[COLUMNS["score"]])
        )
    if postprocessing:
        return postprocess(pd.concat(scores))
    return pd.concat(scores)


def flat_df_for(
    df, student, no_student_columns=NO_ST_COLUMNS.values(), postprocessing=True
):
    """ Extract the data only for one student

    :param df: the dataframe (one row per questions)
    :param no_student_columns: columns that are not students
    :return: dataframe with one row per questions and students

    Columns of csv files:

    - NO_ST_COLUMNS meta data on questions
    - one for each students

    """
    students = extract_students(df, no_student_columns)
    if student not in students:
        raise KeyError("This student is not in the table")
    st_df = df[list(no_student_columns) + [student]]
    st_df = st_df.rename(columns={student: COLUMNS["score"]}).dropna(
        subset=[COLUMNS["score"]]
    )
    if postprocessing:
        return postprocess(st_df)
    return st_df


def postprocess(df):
    """ Postprocessing score dataframe 

    - Replace na with an empty string
    - Replace "NOANSWER" with -1
    - Turn commas number to dot numbers
    """

    df[COLUMNS["question"]].fillna("", inplace=True)
    df[COLUMNS["exercise"]].fillna("", inplace=True)
    df[COLUMNS["comment"]].fillna("", inplace=True)
    df[COLUMNS["competence"]].fillna("", inplace=True)

    df[COLUMNS["score"]] = pd.to_numeric(
        df[COLUMNS["score"]]
        .replace(VALIDSCORE["NOANSWER"], -1)
        .apply(lambda x: try_replace(x, ",", "."))
    )
    df[COLUMNS["score_rate"]] = pd.to_numeric(
        df[COLUMNS["score_rate"]].apply(lambda x: try_replace(x, ",", ".")),
        errors="coerce",
    )

    return df


# -----------------------------
# Reglages pour 'vim'
# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
# cursor: 16 del
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00			`#!/usr/bin/env python`
			`# encoding: utf-8`

			`""" Extracting data from xlsx files """`

			`import pandas as pd`
Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`from .config import NO_ST_COLUMNS, COLUMNS, VALIDSCORE`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00
			`pd.set_option("Precision", 2)`

Feat: Base for student exploration 2019-08-20 19:15:39 +00:00
Feat: Minor thing 2019-08-06 05:02:07 +00:00			`def try_replace(x, old, new):`
			`try:`
			`return str(x).replace(old, new)`
			`except ValueError:`
			`return x`

Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00
Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`def extract_students(df, no_student_columns=NO_ST_COLUMNS.values()):`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00			`""" Extract the list of students from df`

			`:param df: the dataframe`
			`:param no_student_columns: columns that are not students`
			`:return: list of students`
			`"""`
			`students = df.columns.difference(no_student_columns)`
			`return students`


Feat: Base for student exploration 2019-08-20 19:15:39 +00:00			`def flat_df_students(`
			`df, no_student_columns=NO_ST_COLUMNS.values(), postprocessing=True`
			`):`
			`""" Flat the dataframe by returning a dataframe with on student on each line`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00
			`:param df: the dataframe (one row per questions)`
			`:param no_student_columns: columns that are not students`
			`:return: dataframe with one row per questions and students`

			`Columns of csv files:`

Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`- NO_ST_COLUMNS meta data on questions`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00			`- one for each students`

			`This function flat student's columns to "student" and "score"`
			`"""`
			`students = extract_students(df, no_student_columns)`
			`scores = []`
			`for st in students:`
			`scores.append(`
			`pd.melt(`
			`df,`
			`id_vars=no_student_columns,`
			`value_vars=st,`
Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`var_name=COLUMNS["student"],`
			`value_name=COLUMNS["score"],`
Feat: Minor thing 2019-08-06 05:02:07 +00:00			`).dropna(subset=[COLUMNS["score"]])`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00			`)`
Feat: Base for student exploration 2019-08-20 19:15:39 +00:00			`if postprocessing:`
			`return postprocess(pd.concat(scores))`
Feat: Minor thing 2019-08-06 05:02:07 +00:00			`return pd.concat(scores)`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00

Feat: Base for student exploration 2019-08-20 19:15:39 +00:00			`def flat_df_for(`
			`df, student, no_student_columns=NO_ST_COLUMNS.values(), postprocessing=True`
			`):`
			`""" Extract the data only for one student`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00
Feat: Base for student exploration 2019-08-20 19:15:39 +00:00			`:param df: the dataframe (one row per questions)`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00			`:param no_student_columns: columns that are not students`
			`:return: dataframe with one row per questions and students`

Feat: Base for student exploration 2019-08-20 19:15:39 +00:00			`Columns of csv files:`

			`- NO_ST_COLUMNS meta data on questions`
			`- one for each students`

			`"""`
			`students = extract_students(df, no_student_columns)`
			`if student not in students:`
			`raise KeyError("This student is not in the table")`
			`st_df = df[list(no_student_columns) + [student]]`
			`st_df = st_df.rename(columns={student: COLUMNS["score"]}).dropna(`
			`subset=[COLUMNS["score"]]`
			`)`
			`if postprocessing:`
			`return postprocess(st_df)`
			`return st_df`


			`def postprocess(df):`
			`""" Postprocessing score dataframe`

			`- Replace na with an empty string`
			`- Replace "NOANSWER" with -1`
			`- Turn commas number to dot numbers`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00			`"""`

Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`df[COLUMNS["question"]].fillna("", inplace=True)`
			`df[COLUMNS["exercise"]].fillna("", inplace=True)`
			`df[COLUMNS["comment"]].fillna("", inplace=True)`
			`df[COLUMNS["competence"]].fillna("", inplace=True)`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00
Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`df[COLUMNS["score"]] = pd.to_numeric(`
			`df[COLUMNS["score"]]`
			`.replace(VALIDSCORE["NOANSWER"], -1)`
Feat: Minor thing 2019-08-06 05:02:07 +00:00			`.apply(lambda x: try_replace(x, ",", "."))`
Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`)`
Fix: bug on score_rate parsing 2019-08-04 21:32:22 +00:00			`df[COLUMNS["score_rate"]] = pd.to_numeric(`
Feat: Base for student exploration 2019-08-20 19:15:39 +00:00			`df[COLUMNS["score_rate"]].apply(lambda x: try_replace(x, ",", ".")),`
			`errors="coerce",`
Fix: bug on score_rate parsing 2019-08-04 21:32:22 +00:00			`)`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00
			`return df`


			`# -----------------------------`
			`# Reglages pour 'vim'`
			`# vim:set autoindent expandtab tabstop=4 shiftwidth=4:`
			`# cursor: 16 del`