Coverage for src/ensae_projects/datainc/data_helper.py: 63%

123 statements  

« prev     ^ index     » next       coverage.py v7.1.0, created at 2023-07-20 04:37 +0200

1""" 

2@file 

3@brief Simple functions to process text files. 

4""" 

5import datetime 

6from pyquickhelper.loghelper import noLOG 

7from .data_exception import FileFormatException 

8 

9 

def convert_dates(sd, option=None, exc=False):
    """
    Converts a string holding a date into the ISO format ``YYYY-MM-DD``.

    @param      sd      string
    @param      option  see below, no conversion is attempted for any other value
    @param      exc     if True, the parsing error (``ValueError``) is raised
                        instead of being silently ignored
    @return             converted string, or *sd* unchanged if no conversion
                        was done

    * ``'F'``: dates must contain ``/`` and format is ``DD/MM/YY``
    """
    if option == "F" and "/" in sd:
        try:
            parsed = datetime.datetime.strptime(sd, "%d/%m/%y")
        except ValueError:
            # Best effort by default: an unparsable value is returned as is,
            # unless the caller explicitly asked for the failure.
            if exc:
                raise
        else:
            return parsed.strftime("%Y-%m-%d")
    return sd

29 

30 

def clean_column_name_sql_dump(i, line, hist, sep=";"):
    """
    Splits a line into fields, strips the spaces around every field
    and joins them back. A separator inside a quoted field
    (simple or double quotes) does not split the field.
    Quotes themselves are kept. The line looks like:

    ::

        0; "a"; 'j"'; "r;"

    which becomes ``0;"a";'j"';"r;"`` (4 columns).

    @param      i       line number (unused)
    @param      line    line to process
    @param      hist    distribution of the number of columns (unused)
    @param      sep     column separator (a single character)
    @return             text line, number of columns

    A trailing ``\\n`` is always preserved. An empty line has 0 column.
    """
    ending = "\n" if line.endswith("\n") else ""
    body = line[:-1] if ending else line
    if not body:
        return ending, 0

    vals = []
    beg = 0          # index of the first character of the current field
    quote = None     # quote character which opened the current field, if any
    started = False  # True once the current field has a non-space character
    for ii, c in enumerate(body):
        if quote is not None:
            # Inside a quoted field: only the matching quote matters.
            if c == quote:
                quote = None
        elif c == sep:
            vals.append(body[beg:ii].strip())
            beg = ii + 1
            started = False
        elif not started and not c.isspace():
            # A field is quoted only if its first significant character
            # is a quote; quotes appearing later are plain characters.
            started = True
            if c in ('"', "'"):
                quote = c
    # Last field, it is empty if the line ends with a separator.
    vals.append(body[beg:].strip())
    return sep.join(vals) + ending, len(vals)

69 

70 

def change_encoding(infile, outfile, enc1, enc2="utf-8",
                    process=None, fLOG=noLOG):
    """
    Changes the encoding of a text file, every line goes through
    function *process* before being written.
    By default, *process* returns the line unchanged.

    @param      infile      input file
    @param      outfile     output file
    @param      enc1        encoding of the input file
    @param      enc2        encoding of the output file
    @param      process     function which processes a line, see below
    @param      fLOG        logging function, called every 10000 lines
    @return                 number of processed lines (0 for an empty file)

    function ``process`` ::

        def process(line_number, line):
            # ...
            return line

    *line_number* starts at 0.
    See @see fn clean_column_name_sql_dump for an example.
    """
    if process is None:
        def process(i, s):  # pylint: disable=E0102
            return s

    nb_lines = 0
    with open(infile, "r", encoding=enc1) as f:
        with open(outfile, "w", encoding=enc2) as g:
            # nb_lines is 1-based so that it holds the number of lines
            # once the loop is over, the callback still receives a 0-based index.
            for nb_lines, line in enumerate(f, start=1):
                if nb_lines % 10000 == 0:
                    fLOG(infile, "-", nb_lines, "lines")
                g.write(process(nb_lines - 1, line))
    return nb_lines

106 

107 

def change_encoding_improve(infile, outfile, enc1, enc2="utf-8",
                            process=None, fLOG=noLOG):
    """
    Changes the encoding of a text file, every line goes through
    function *process* before being written.
    The function *process* has access to the distribution of the
    number of columns in the previous lines.
    By default, *process* returns the line unchanged and 0 column.

    @param      infile      input file
    @param      outfile     output file
    @param      enc1        encoding of the input file
    @param      enc2        encoding of the output file
    @param      process     function which processes a line, see below
    @param      fLOG        logging function, called every 10000 lines
    @return                 number of processed lines (0 for an empty file)

    function ``process`` ::

        def process(line_number, line, histo_nb_columns):
            # ...
            return line, number_of_columns

    *line_number* starts at 0, *histo_nb_columns* is a dictionary
    ``{ number of columns: number of lines }`` built from the lines
    already processed.
    """
    if process is None:
        def process(i, s, hist):  # pylint: disable=E0102
            return s, 0

    hist = {}
    nb_lines = 0
    with open(infile, "r", encoding=enc1) as f:
        with open(outfile, "w", encoding=enc2) as g:
            # nb_lines is 1-based so that it holds the number of lines
            # once the loop is over, the callback still receives a 0-based index.
            for nb_lines, line in enumerate(f, start=1):
                if nb_lines % 10000 == 0:
                    fLOG(infile, "-", nb_lines, "lines")
                line, nb_col = process(nb_lines - 1, line, hist)
                # The histogram is updated after the call: process only
                # sees the previous lines.
                hist[nb_col] = hist.get(nb_col, 0) + 1
                g.write(line)
    return nb_lines

146 

147 

def enumerate_text_lines(filename, sep="\t", encoding="utf-8", quotes_as_str=False, header=True,
                         clean_column_name=None, convert_float=False, option=None, skip=0, take=-1,
                         fLOG=noLOG):
    """
    Enumerates all lines from a text file and does some cleaning (see the list of parameters).

    @param      filename            filename
    @param      sep                 column separator
    @param      header              first row is header, otherwise columns
                                    are named ``c0``, ``c1``, ...
    @param      encoding            encoding
    @param      quotes_as_str       removes the quotes surrounding a value
    @param      clean_column_name   function to clean column name
    @param      convert_float       convert number into float wherever possible
    @param      option              several option to clean dates, see below
    @param      skip                number of data rows to skip (the header is not counted)
    @param      take                number of rows to consider (-1 for all)
    @param      fLOG                logging function, called every 100000 rows
    @return                         iterator on dictionary ``{ column: value }``

    Options to cleaning dates:

    * ``'F'``: dates must contain ``/`` and format is ``DD/MM/YY``

    A row with a single field and a schema with more than one column
    is skipped (probably an empty last line), any other mismatch raises
    ``FileFormatException``.
    """
    def get_schema(row):
        # Column names come from the first row or are generated.
        if header:
            sch = [_.strip('"') for _ in row]
            if clean_column_name:
                sch = [clean_column_name(_) for _ in sch]
            return sch
        return ["c%00d" % i for i in range(len(row))]

    def clean_quotes(s):
        # Removes matching quotes surrounding the value.
        if quotes_as_str and len(s) > 1 and s[0] == s[-1] and s[0] in ('"', "'"):
            return s[1:-1]
        return s

    def convert(s):
        # Converts into float if requested and possible.
        if convert_float:
            try:
                return float(s)
            except ValueError:
                return s
        return s

    def clean_dates(fields):
        # Only strings can hold a date: values already converted
        # into float by convert_float must be left untouched.
        if not option:
            return fields
        return {k: convert_dates(v, option) if isinstance(v, str) else v
                for k, v in fields.items()}

    first_data = 1 if header else 0  # index of the first data row in the file
    with open(filename, "r", encoding=encoding) as f:
        schema = None
        nb = 0
        for i, line in enumerate(f):
            if nb >= take >= 0:
                break
            spl = line.strip("\r\n").split(sep)
            if i == 0:
                schema = get_schema(spl)
                if header:
                    continue
            # i - first_data is the 0-based index of the data row.
            if i - first_data < skip:
                continue
            if len(spl) != len(schema):
                if len(spl) == 1:
                    # probably the last line
                    continue
                raise FileFormatException("different number of columns: schema {0} != {1} for line {2}".format(
                    len(schema), len(spl), i + 1))
            val = {k: convert(clean_quotes(v)) for k, v in zip(schema, spl)}
            yield clean_dates(val)
            nb += 1
            if nb % 100000 == 0:
                fLOG(filename, "-", nb, "lines")