Coverage for src/papierstat/datasets/biased.py: 100%
18 statements
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-30 06:49 +0200
« prev ^ index » next coverage.py v7.2.7, created at 2023-06-30 06:49 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Jeux de données biaisés.
5"""
6import numpy
7from scipy.special import expit # pylint: disable=E0611
8import pandas
9from sklearn.model_selection import train_test_split
12__all__ = ['load_biased']
def load_biased(N=250):
    """
    Build a synthetic biased dataset and split it into train and test parts.

    Every observation holds four fields, in this order:

    * ``age``: random integer drawn with ``randint(18, 65)``
      (upper bound excluded),
    * ``gender``: random integer, 0 or 1,
    * ``kids``: integer between 0 and 4, obtained by locating a uniform
      draw among the cumulative thresholds 0.6, 0.8, 0.9, 0.95, 1.0,
    * ``R``: a value (presumably an income -- confirm with the authors)
      which grows with ``age`` through a logistic function, decreases
      with ``kids`` and is never lower than 1000.

    All draws come from the global :epkg:`numpy` random state.

    :param N: number of observations to generate
    :return: the output of ``train_test_split`` called with its default
        settings on the :epkg:`pandas:DataFrame` of observations,
        i.e. the train part followed by the test part
    """
    # Cumulative probabilities for 0, 1, 2, 3 and 4 kids.
    thresholds = numpy.array([0.6, 0.8, 0.9, 0.95, 1.])

    rows = []
    for _ in range(N):
        # The order of the random draws below matters: it determines the
        # values obtained for a given state of the global generator.
        age = numpy.random.randint(18, 65)
        gender = numpy.random.randint(0, 2)
        n_kids = numpy.searchsorted(thresholds, numpy.random.random())

        # Age effect: a logistic weight applied to a random amount
        # in [1000, 2000), on top of a fixed base of 1000.
        amount = 1000 + 1000 * numpy.random.random()
        value = 1000 + (expit(age / 30.) * amount)

        # Each kid removes a random amount in [400, 500).
        value -= n_kids * (400 + 100 * numpy.random.random())

        # NOTE(review): randint(0, 1) excludes its upper bound and can only
        # return 0, so this gender term never changes the value. Confirm
        # the intended penalty before modifying it.
        value -= gender * numpy.random.randint(0, 1)

        # Floor at 1000 (same result as max(value, 1000)).
        if 1000 > value:
            value = 1000

        rows.append({'age': age, 'gender': gender, 'kids': n_kids, 'R': value})

    frame = pandas.DataFrame(rows)
    return train_test_split(frame)