Coverage for src/ensae_teaching_cs/data/datatext.py: 94%

17 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-01-27 05:44 +0100

1# -*- coding: utf-8 -*- 

2""" 

3@file 

4@brief Text data sets (sentences labelled with a sentiment). 

5""" 

6import os 

7import pandas 

8 

9 

def load_sentiment_dataset(cache="."):
    """
    Loads the *Sentiment Labelled Sentences* data set: English sentences,
    each one paired with a positive or negative sentiment label.
    Source:
    `Sentiment Labelled Sentences Data Set <https://archive.ics.uci.edu/ml/datasets/Sentiment+Labelled+Sentences>`_.

    @param      cache   folder handed to ``download_data`` as ``whereTo``
                        (where the archive is stored and unzipped)
    @return             a single pandas DataFrame with the columns
                        ``sentance`` (spelling kept as is, callers may rely
                        on it), ``sentiment`` and ``source``, the latter
                        being the name of the originating file without
                        its extension
    @raise  ValueError  if ``download_data`` does not return exactly
                        9 entries for the archive
    """
    from pyensae.datasource import download_data

    # Location of the archive noted by the original author (kept for reference):
    # https://archive.ics.uci.edu/ml/machine-learning-databases/00331/
    name = "sentiment_labelled_sentences.zip"
    extracted = download_data(name, whereTo=cache)
    if len(extracted) != 9:
        raise ValueError(f"Unzipping '{name}' failed.")

    def _is_data_file(path):
        # Keep the .txt files only, leaving out the readme and
        # anything located under a __MACOSX entry.
        return (".txt" in path
                and "readme" not in path
                and "__MACOSX" not in path)

    def _read_labelled_file(path):
        # Tab-separated file without header; quoting=3 is csv.QUOTE_NONE,
        # quote characters are read as plain text.
        frame = pandas.read_csv(path, sep='\t', quoting=3,
                                names=['sentance', 'sentiment'])
        # Tag every row with the file it comes from (no folder, no extension).
        frame["source"] = os.path.splitext(os.path.basename(path))[0]
        return frame

    return pandas.concat(
        [_read_labelled_file(path) for path in extracted
         if _is_data_file(path)])