recopytex/recopytex/csv_extraction.py

#!/usr/bin/env python
# encoding: utf-8

""" Extracting data from xlsx files """

import pandas as pd
from .config import NO_ST_COLUMNS, COLUMNS, VALIDSCORE

# Display floats with 2 decimals. The full option key is used on purpose:
# pandas resolves option names by pattern matching, and the short form
# "Precision" is ambiguous on versions that also define
# "styler.format.precision" (it then raises OptionError at import time).
pd.set_option("display.precision", 2)

def try_replace(x, old, new):
    """ Replace old by new in the string form of x

    :param x: any value, it is converted with str() before replacing
    :param old: substring to look for
    :param new: substring used as replacement
    :return: str(x) with old replaced by new, or x itself when a
        ValueError is raised during the conversion/replacement
    """
    try:
        text = str(x)
        replaced = text.replace(old, new)
    except ValueError:
        # Best effort: hand the value back untouched
        return x
    return replaced


def extract_students(df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Get the student columns of df

    Every column of df which is not listed in no_student_columns is
    considered to be a student.

    :param df: the dataframe
    :param no_student_columns: columns that are not students
    :return: labels of the student columns (the result of
        df.columns.difference, not a plain list)
    """
    return df.columns.difference(no_student_columns)


def flat_df_students(df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Reshape df from one column per student to one row per student

    :param df: the dataframe (one row per questions)
    :param no_student_columns: columns that are not students
    :return: dataframe with one row per questions and students

    Expected columns of df:

    - no_student_columns: meta data on questions
    - one column for each student, holding the scores

    Each student column is melted into a COLUMNS["student"] column (the
    student's name) and a COLUMNS["score"] column (the value). Rows where
    the score is missing are dropped, then all students are concatenated.
    """

    def melt_student(student):
        # Long format for a single student, without the missing scores
        melted = pd.melt(
            df,
            id_vars=no_student_columns,
            value_vars=student,
            var_name=COLUMNS["student"],
            value_name=COLUMNS["score"],
        )
        return melted.dropna(subset=[COLUMNS["score"]])

    per_student = [
        melt_student(student)
        for student in extract_students(df, no_student_columns)
    ]
    return pd.concat(per_student)


def flat_clear_csv(csv_df, no_student_columns=NO_ST_COLUMNS.values()):
    """ Flat and clear the dataframe extracted from csv

    :param csv_df: data frame read from csv
    :param no_student_columns: columns that are not students
    :return: dataframe with one row per questions and students

    Cleaning steps applied after flattening:

    - missing values in the question, exercise, comment and competence
      columns are replaced by an empty string
    - in the score column, VALIDSCORE["NOANSWER"] is replaced by -1, decimal
      commas become dots and the result is converted to numbers (a value
      that cannot be converted raises, as pd.to_numeric does by default)
    - in the score_rate column, decimal commas become dots and the result
      is converted to numbers, invalid values becoming NaN
    """
    # Forward no_student_columns so a caller-supplied value is honoured
    df = flat_df_students(csv_df, no_student_columns)

    # Assign the filled column back instead of calling fillna(inplace=True)
    # on the selection: the chained in-place form does not update df when
    # pandas Copy-on-Write is active.
    for key in ("question", "exercise", "comment", "competence"):
        column = COLUMNS[key]
        df[column] = df[column].fillna("")

    df[COLUMNS["score"]] = pd.to_numeric(
        df[COLUMNS["score"]]
        .replace(VALIDSCORE["NOANSWER"], -1)
        .apply(lambda x: try_replace(x, ",", "."))
    )
    df[COLUMNS["score_rate"]] = pd.to_numeric(
        df[COLUMNS["score_rate"]]
        .apply(lambda x: try_replace(x, ",", ".")),
        errors="coerce"
    )

    return df


# -----------------------------
# Settings for 'vim'
# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
# cursor: 16 del
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00			`#!/usr/bin/env python`
			`# encoding: utf-8`

			`""" Extracting data from xlsx files """`

			`import pandas as pd`
Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`from .config import NO_ST_COLUMNS, COLUMNS, VALIDSCORE`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00
			`pd.set_option("Precision", 2)`

Feat: Minor thing 2019-08-06 05:02:07 +00:00			`def try_replace(x, old, new):`
			`try:`
			`return str(x).replace(old, new)`
			`except ValueError:`
			`return x`

Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00
Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`def extract_students(df, no_student_columns=NO_ST_COLUMNS.values()):`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00			`""" Extract the list of students from df`

			`:param df: the dataframe`
			`:param no_student_columns: columns that are not students`
			`:return: list of students`
			`"""`
			`students = df.columns.difference(no_student_columns)`
			`return students`


Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`def flat_df_students(df, no_student_columns=NO_ST_COLUMNS.values()):`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00			`""" Flat the ws for students`

			`:param df: the dataframe (one row per questions)`
			`:param no_student_columns: columns that are not students`
			`:return: dataframe with one row per questions and students`

			`Columns of csv files:`

Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`- NO_ST_COLUMNS meta data on questions`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00			`- one for each students`

			`This function flat student's columns to "student" and "score"`
			`"""`
			`students = extract_students(df, no_student_columns)`
			`scores = []`
			`for st in students:`
			`scores.append(`
			`pd.melt(`
			`df,`
			`id_vars=no_student_columns,`
			`value_vars=st,`
Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`var_name=COLUMNS["student"],`
			`value_name=COLUMNS["score"],`
Feat: Minor thing 2019-08-06 05:02:07 +00:00			`).dropna(subset=[COLUMNS["score"]])`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00			`)`
Feat: Minor thing 2019-08-06 05:02:07 +00:00			`return pd.concat(scores)`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00

Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`def flat_clear_csv(csv_df, no_student_columns=NO_ST_COLUMNS.values()):`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00			`""" Flat and clear the dataframe extracted from csv`

			`:param csv_df: data frame read from csv`
			`:param no_student_columns: columns that are not students`
			`:return: dataframe with one row per questions and students`

			`"""`
			`df = flat_df_students(csv_df)`

Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`df[COLUMNS["question"]].fillna("", inplace=True)`
			`df[COLUMNS["exercise"]].fillna("", inplace=True)`
			`df[COLUMNS["comment"]].fillna("", inplace=True)`
			`df[COLUMNS["competence"]].fillna("", inplace=True)`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00
Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`df[COLUMNS["score"]] = pd.to_numeric(`
			`df[COLUMNS["score"]]`
			`.replace(VALIDSCORE["NOANSWER"], -1)`
Feat: Minor thing 2019-08-06 05:02:07 +00:00			`.apply(lambda x: try_replace(x, ",", "."))`
Feat: first analysis and plots 2019-08-04 21:24:32 +00:00			`)`
Fix: bug on score_rate parsing 2019-08-04 21:32:22 +00:00			`df[COLUMNS["score_rate"]] = pd.to_numeric(`
			`df[COLUMNS["score_rate"]]`
Feat: Minor thing 2019-08-06 05:02:07 +00:00			`.apply(lambda x: try_replace(x, ",", ".")),`
			`errors="coerce"`
Fix: bug on score_rate parsing 2019-08-04 21:32:22 +00:00			`)`
Feat: csv extraction and flattening 2019-08-04 19:57:27 +00:00
			`return df`


			`# -----------------------------`
			`# Reglages pour 'vim'`
			`# vim:set autoindent expandtab tabstop=4 shiftwidth=4:`
			`# cursor: 16 del`