Coverage for src/ensae_projects/datainc/data_helper.py: 63%
123 statements
« prev ^ index » next coverage.py v7.1.0, created at 2023-07-20 04:37 +0200
« prev ^ index » next coverage.py v7.1.0, created at 2023-07-20 04:37 +0200
1"""
2@file
3@brief Simple functions to process text files.
4"""
5import datetime
6from pyquickhelper.loghelper import noLOG
7from .data_exception import FileFormatException
def convert_dates(sd, option=None, exc=False):
    """
    Converts a string into a date.

    @param      sd          string
    @param      option      see below
    @param      exc         if True, raises an exception when *sd* contains ``/``
                            but cannot be parsed with the format selected by *option*,
                            otherwise the string is returned unchanged
    @return                 string, formatted as ``YYYY-MM-DD`` when the conversion
                            succeeded, *sd* itself otherwise

    * ``'F'``: dates must contain ``/`` and format is ``DD/MM/YY``
    """
    if option == "F" and "/" in sd:
        try:
            parsed = datetime.datetime.strptime(sd, "%d/%m/%y")
        except ValueError:
            # The string looked like a date but is not one: only fail loudly
            # when the caller explicitly asked for it.
            if exc:
                raise
            return sd
        return parsed.strftime("%Y-%m-%d")
    return sd
def clean_column_name_sql_dump(i, line, hist, sep=";"):
    """
    Splits a line into fields, strips the spaces around every field
    and joins the fields again with *sep*. The line looks like:

    ::

        0; "a"; 'j"'; "r;"

    A field whose very first character is a quote (``"`` or ``'``)
    may contain *sep*: the separator is only looked for once the
    same quote character has been met again. Quotes are kept in the output.

    @param      i       line number (unused)
    @param      line    line to process
    @param      hist    distribution of the number of columns (unused)
    @param      sep     column separator
    @return             text line, number of columns

    The final ``\\n`` is added back to the returned line only if it was
    seen after the first character of a field.
    """
    fields = []
    start = -1          # index where the current field begins, -1 between two fields
    newline = ""
    for pos, char in enumerate(line):
        if start == -1:
            # First character of a field: it decides which character ends it.
            start = pos
            expected = char if char in ('"', "'") else sep
            continue
        if char == expected:
            if char == sep:
                fields.append(line[start:pos].strip())
                start = -1
            else:
                # Closing quote: from now on the separator ends the field.
                expected = sep
        elif char == "\n":
            newline = char
    if start != -1:
        # Last field, not followed by a separator.
        fields.append(line[start:].strip())
    return sep.join(fields) + newline, len(fields)
def change_encoding(infile, outfile, enc1, enc2="utf-8",
                    process=None, fLOG=noLOG):
    """
    Reads a text file with one encoding and writes it with another one,
    every line goes through *process* before being written.
    By default, lines are copied unchanged.

    @param      infile      input file
    @param      outfile     output file
    @param      enc1        encoding of the input file
    @param      enc2        encoding of the output file
    @param      process     function which processes a line, see below
    @param      fLOG        logging function, called every 10000 lines
    @return                 index of the last line read (first line is 0),
                            0 if the input file is empty

    function ``process`` ::

        def process(line_number, line):
            # ...
            return line

    See @see fn clean_column_name_sql_dump for an example.
    """
    def _identity(_index, text):
        return text

    transform = _identity if process is None else process
    last_index = 0
    with open(infile, "r", encoding=enc1) as src, \
            open(outfile, "w", encoding=enc2) as dst:
        for last_index, text in enumerate(src):
            count = last_index + 1
            if count % 10000 == 0:
                fLOG(infile, "-", count, "lines")
            dst.write(transform(last_index, text))
    return last_index
def change_encoding_improve(infile, outfile, enc1, enc2="utf-8",
                            process=None, fLOG=noLOG):
    """
    Reads a text file with one encoding and writes it with another one,
    every line goes through *process* before being written.
    By default, lines are copied unchanged.
    Function *process* receives the distribution of the number of columns
    observed in the previous lines.

    @param      infile      input file
    @param      outfile     output file
    @param      enc1        encoding of the input file
    @param      enc2        encoding of the output file
    @param      process     function which processes a line, see below
    @param      fLOG        logging function, called every 10000 lines
    @return                 index of the last line read (first line is 0),
                            0 if the input file is empty

    function ``process`` ::

        def process(line_number, line, histo_nb_columns):
            # ...
            return line, number_of_columns
    """
    def _passthrough(_index, text, _histogram):
        return text, 0

    transform = process if process is not None else _passthrough
    # number of columns -> number of lines seen so far with that many columns
    column_counts = {}
    last_index = 0
    with open(infile, "r", encoding=enc1) as src, \
            open(outfile, "w", encoding=enc2) as dst:
        for last_index, text in enumerate(src):
            count = last_index + 1
            if count % 10000 == 0:
                fLOG(infile, "-", count, "lines")
            text, n_columns = transform(last_index, text, column_counts)
            column_counts[n_columns] = column_counts.get(n_columns, 0) + 1
            dst.write(text)
    return last_index
def enumerate_text_lines(filename, sep="\t", encoding="utf-8", quotes_as_str=False, header=True,
                         clean_column_name=None, convert_float=False, option=None, skip=0, take=-1,
                         fLOG=noLOG):
    """
    Enumerates all lines from a text file and does some cleaning (see the list of parameters).

    @param      filename            filename
    @param      sep                 column separator
    @param      header              first row is header
    @param      encoding            encoding
    @param      quotes_as_str       remove the quotes surrounding a value
    @param      clean_column_name   function to clean column name
    @param      convert_float       convert number into float wherever possible
    @param      option              several option to clean dates, see below
    @param      skip                number of rows to skip
    @param      take                number of rows to consider (-1 for all)
    @param      fLOG                logging function, called every 100000 rows
    @return                         iterator on dictionary ``{column name: value}``

    Raises *FileFormatException* if a row with more than one field
    does not have as many fields as the schema.

    Options to cleaning dates:

    * ``'F'``: dates must contain ``/`` and format is ``DD/MM/YY``
    """
    def get_schema(row, header, clean_column_name):
        # Column names come from the first row if it is a header,
        # otherwise they are built from the column index.
        if header:
            sch = [_.strip('"') for _ in row]
            if clean_column_name:
                sch = [clean_column_name(_) for _ in sch]
            return sch
        return ["c%00d" % i for i in range(len(row))]

    def convert(s, convert_float):
        if convert_float:
            try:
                return float(s)
            except ValueError:
                return s
        return s

    def clean_quotes(s, quotes_as_str):
        # Only strips a pair of identical quotes surrounding the value.
        if quotes_as_str:
            if s and len(s) > 1 and s[0] == s[-1] and s[0] in ('"', "'"):
                return s[1:-1]
        return s

    def clean_dates(fields, option):
        if option == "F":
            update = {}
            for k, v in fields.items():
                # Values already converted into float (convert_float=True)
                # are not strings and cannot be dates.
                if isinstance(v, str) and "/" in v:
                    try:
                        v2 = datetime.datetime.strptime(v, "%d/%m/%y")
                    except ValueError:
                        continue
                    update[k] = v2.strftime("%Y-%m-%d")
            if update:
                fields.update(update)
        return fields

    with open(filename, "r", encoding=encoding) as f:
        d = 0
        nb = 0
        for i, line in enumerate(f):
            # take < 0 means no limit
            if nb >= take >= 0:
                break
            spl = line.strip("\r\n").split(sep)
            if i == 0:
                schema = get_schema(spl, header, clean_column_name)
                if header:
                    d = 1
                    continue
            # NOTE(review): the test uses i + d (d is 1 when there is a header),
            # confirm this is the intended meaning of *skip*.
            if i + d < skip:
                continue
            if len(spl) != len(schema):
                if len(spl) == 1:
                    # probably the last line
                    continue
                raise FileFormatException("different number of columns: schema {0} != {1} for line {2}".format(
                    len(schema), len(spl), i + 1))
            val = {k: convert(clean_quotes(v, quotes_as_str), convert_float)
                   for k, v in zip(schema, spl)}
            val = clean_dates(val, option)
            yield val
            nb += 1
            if nb % 100000 == 0:
                fLOG(filename, "-", nb, "lines")