Coverage for src/ensae_teaching_cs/data/datatext.py: 94%
17 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-04-28 06:23 +0200
1# -*- coding: utf-8 -*-
2"""
3@file
4@brief Text datasets (sentences labelled with a sentiment).
5"""
6import os
7import pandas
def load_sentiment_dataset(cache="."):
    """
    Returns a set of English sentences, each one labelled
    with a positive or negative sentiment.
    Source:
    `Sentiment Labelled Sentences Data Set <https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences>`_.

    @param      cache   where to cache or unzip the data if downloaded
                        a second time (forwarded to ``download_data``
                        as ``whereTo``)
    @return             DataFrame with the columns ``sentance``,
                        ``sentiment`` and ``source`` (name of the file
                        the row comes from, without its extension)
    @raise ValueError   if the archive does not yield exactly nine
                        entries, or if none of them is a data file
    """
    # Imported lazily so the module can be imported without pyensae.
    from pyensae.datasource import download_data

    # The archive used to be published at
    # https://archive.ics.uci.edu/ml/machine-learning-databases/00331/
    name = "sentiment_labelled_sentences.zip"
    res = download_data(name, whereTo=cache)
    if len(res) != 9:
        raise ValueError(f"Unzipping '{name}' failed.")

    dfs = []
    for fi in res:
        # The extension and "readme" tests look at the file name only, so
        # that the name of the cache folder cannot interfere with them.
        # "__MACOSX" is a folder, hence it is searched in the full path.
        stem, ext = os.path.splitext(os.path.split(fi)[-1])
        if ext != ".txt" or "readme" in stem or "__MACOSX" in fi:
            continue
        # quoting=3 is csv.QUOTE_NONE: quote characters are kept as text.
        df = pandas.read_csv(fi, sep='\t', quoting=3,
                             names=['sentance', 'sentiment'])
        df["source"] = stem
        dfs.append(df)

    if not dfs:
        # pandas.concat would raise a less explicit ValueError here.
        raise ValueError(f"No data file found in '{name}'.")
    return pandas.concat(dfs)