Mapytex/mapytex/stat/dataset.py

208 lines
5.2 KiB
Python
Raw Permalink Normal View History

2019-10-13 19:01:31 +00:00
#!/usr/bin/env python
2016-01-08 13:58:21 +00:00
# -*- coding:Utf-8 -*-
#
#
# Collection of functions that make solving and writing high-school statistics exercises much more convenient
#
#
2016-02-13 04:04:08 +00:00
# TODO: Make every answer explainable!! |Tue Jan 12 09:41:00 EAT 2016
2016-01-08 14:01:39 +00:00
from math import sqrt, ceil
2016-01-09 15:40:02 +00:00
from .number_tools import number_factory
2016-01-09 15:51:20 +00:00
from .random_generator import random_generator
2016-01-08 13:58:21 +00:00
2016-02-13 04:04:08 +00:00
2016-01-08 14:01:39 +00:00
class Dataset(list):
    """A dataset (a list) with statistics and LaTeX rendering methods.

    Most statistics are wrapped by ``number_factory`` (imported from
    ``.number_tools``), which post-processes the returned number --
    presumably rounding it, as the ``sd()`` example below suggests.

    >>> s = Dataset(range(100))
    >>> s.sum()
    4950
    >>> s.mean()
    49.5
    >>> s.deviation()
    83325
    >>> s.variance()
    833.25
    >>> s.sd()
    28.87
    """

    @classmethod
    def random(
        cls,
        length,
        data_name="Valeurs",
        distrib="gauss",
        rd_args=(0, 1),
        nbr_format=lambda x: round(x, 2),
        v_min=None,
        v_max=None,
        exact_mean=None,
    ):
        """Generate a dataset filled with random values.

        All generation arguments are forwarded unchanged to
        ``random_generator``.

        :param length: length of the dataset
        :param data_name: name of the data set
        :param distrib: distribution of the data set. It can be a function
            or a string from ["randint", "uniform", "gauss", "choice"]
        :param rd_args: arguments to pass to distrib
        :param nbr_format: function which formats each value
        :param v_min: minimum accepted value
        :param v_max: maximum accepted value
        :param exact_mean: if set, the last generated number is chosen so
            that the computed mean is exactly equal to "exact_mean"
        :return: a new instance of ``cls`` holding the generated values
        """
        data = random_generator(
            length, distrib, rd_args, nbr_format, v_min, v_max, exact_mean
        )
        return cls(data, data_name=data_name)

    def __init__(self, data=(), data_name="Valeurs"):
        """Create a numeric data set.

        :param data: values of the data set (any iterable, copied)
        :param data_name: name of the data set
        """
        list.__init__(self, data)
        # Keep the name on the instance; the previous code bound it to a
        # throwaway local variable, so it was lost.
        self.data_name = data_name

    def add_data(self, data):
        """Add data to the data set.

        :param data: an iterable of values, or a single value
        """
        try:
            self.extend(data)
        except TypeError:
            # Not iterable: treat it as one single value.
            self.append(data)

    # --------------------------
    # Stat tools

    def effectif_total(self):
        """Return the number of values in the data set."""
        return len(self)

    @number_factory
    def sum(self):
        """Return the sum of the values."""
        # Inside a method body, ``sum`` resolves to the builtin, not to
        # this method.
        return sum(self)

    @number_factory
    def mean(self):
        """Return the arithmetic mean of the values."""
        return self.sum() / self.effectif_total()

    @number_factory
    def deviation(self):
        """Return the sum of squared differences to the mean (not normalized)."""
        mean = self.mean()
        return sum((x - mean) ** 2 for x in self)

    @number_factory
    def variance(self):
        """Return the variance (deviation divided by the number of values)."""
        return self.deviation() / self.effectif_total()

    @number_factory
    def sd(self):
        """Return the standard deviation (square root of the variance)."""
        return sqrt(self.variance())

    def quartiles(self):
        """Compute the five-number summary of the data set.

        :return: a tuple (min, Q1, Me, Q3, max)

        >>> w = Dataset(range(12))
        >>> w.quartiles()
        (0, 2.5, 5.5, 8.5, 11)
        """
        return (
            min(self),
            self.quartile(1),
            self.quartile(2),
            self.quartile(3),
            max(self),
        )

    @number_factory
    def quartile(self, quartile=1):
        """Compute one quartile of the data set.

        The values are sorted before the quartile is looked up, so the
        result does not depend on insertion order.

        :param quartile: quartile to compute, expected to be 1, 2 or 3
            (default 1 -> Q1)
        :return: the requested quartile

        :Example:

        >>> w = Dataset(range(12))
        >>> w.quartile(1)
        2.5
        >>> w.quartile(2)
        5.5
        >>> w.quartile(3)
        8.5
        >>> w = Dataset(range(14))
        >>> w.quartile(1)
        3
        >>> w.quartile(2)
        6.5
        >>> w.quartile(3)
        10
        """
        ordered = sorted(self)
        # -1 to match with list indexing
        position = self.posi_quartile(quartile) - 1
        if position.is_integer():
            # The position falls exactly between two values: average them.
            lower = int(position)
            return (ordered[lower] + ordered[lower + 1]) / 2
        return ordered[ceil(position)]

    def posi_quartile(self, quartile=1):
        """Compute the (1-based) position of a quartile.

        :param quartile: the quartile concerned
        :return: the position of the quartile, as a float, not rounded
        """
        return quartile * self.effectif_total() / 4

    # --------------------------
    # LaTeX rendering

    def tabular_latex(self, nbr_lines=1):
        """Return LaTeX code displaying the dataset as a tabular.

        Each row holds ``len(self) // nbr_lines`` values (at least one);
        when the values do not divide evenly, the remainder goes on an
        extra row padded with blank cells.

        :param nbr_lines: requested number of rows (must be >= 1)
        :return: the LaTeX source of the tabular
        :raises ValueError: if nbr_lines is lower than 1
        """
        if nbr_lines < 1:
            raise ValueError("nbr_lines must be a positive integer")

        total = self.effectif_total()
        # Never fewer than one value per row: avoids a modulo by zero when
        # the dataset is empty or shorter than nbr_lines.
        d_per_line = max(1, total // nbr_lines)
        d_last_line = total % d_per_line
        splited_data = [
            self[start:start + d_per_line]
            for start in range(0, total, d_per_line)
        ]
        # Pad the last row with blank cells when it is incomplete
        if d_last_line:
            splited_data[-1] += [" "] * (d_per_line - d_last_line)

        # Build the table
        # NOTE(review): the column spec declares one more column ("|c|")
        # than each row has cells -- possibly meant for a label column;
        # confirm.
        latex = "\\begin{{tabular}}{{|c|*{{{nbr_col}}}{{c|}}}} \n".format(
            nbr_col=d_per_line
        )
        latex += "\t\t \\hline \n"
        d_lines = [" & ".join(map(str, line)) for line in splited_data]
        if d_lines:
            latex += " \\\\ \n \\hline \n".join(d_lines)
            latex += " \\\\ \n \\hline \n"
        latex += "\\end{tabular}"
        return latex
# -----------------------------
# Reglages pour 'vim'
# vim:set autoindent expandtab tabstop=4 shiftwidth=4:
2016-02-13 03:29:26 +00:00
# cursor: 16 del