Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1# coding: latin-1
2"""
@file
@brief Contains a class which iterates over the rows of a text file structured as a table.
7"""
10import re
11import os
12import decimal
14from pyquickhelper.loghelper import noLOG
15from pyquickhelper.loghelper.flog import GetPath
16from .file_text_binary import TextFile
class TextFileColumns(TextFile):
    """
    This class opens a text file as if it were a binary file. It can deal with null characters.
    The file is interpreted as a TSV file or a file containing columns.
    The separator is found automatically.
    The columns are assumed to be in the first line but it is not mandatory.
    It walks along a file through an iterator, every line is automatically
    converted into a dictionary ``{ column : value }``.
    If the class was able to guess the type of a column, the conversion automatically takes place.

    ::

        f = TextFileColumns(filename)
        # filename is a file
        # the separator is unknown --> the class automatically determines it
        # as well as the columns and their type
        f.open()
        for d in f:
            print(d)  # d is a dictionary
        f.close()

    @var    _force_header       there is a header even if not detected
    @var    _force_noheader     there is no header even if detected
    @var    _changes            replace the columns name
    @var    _regexfix           impose a regular expression to interpret a line instead of the automatically built one
    @var    _filter_dict        it is a function which takes a dictionary and returns a boolean
                                which tells if the line must be considered or not
    @var    _fields             name of the columns (if there is no header)

    Spaces and non-ascii characters cannot be used to name a column.
    This name must be a named group for a regular expression.
    """

    def __init__(self, filename, errors=None, fLOG=noLOG, force_header=False, changes=None,
                 force_noheader=False, regex=None, filter=None, fields=None,
                 keep_text_when_bad_type=False, break_at=-1, strip_space=True,
                 force_sep=None, nb_line_guess=100, mistake=3, encoding="utf-8",
                 strict_separator=False):
        """
        @param      filename                    filename
        @param      errors                      see str (errors = ...)
        @param      fLOG                        LOG function, see `fLOG
                                                <http://www.xavierdupre.fr/app/pyquickhelper/helpsphinx/
                                                pyquickhelper/loghelper/flog.html#pyquickhelper.loghelper.flog.fLOG>`_
        @param      force_header                defines the first line as columns header whether it is relevant or not
        @param      changes                     to change the column name, gives the correspondence, example: { "query":"query___" },
                                                it can be a list if there is no header and you want to name any column
        @param      force_noheader              there is no header at all
        @param      regex                       specify a different regular expression (only if changes is a list)
                                                if it is a dictionary, the class will replace the default
                                                by the one associated in regex for this field
        @param      filter                      None if there is no filter, otherwise it is a function
                                                which takes a dictionary and returns a boolean
                                                which tells if the line must be considered or not
        @param      fields                      when the header is not here, these fields will name the columns
        @param      keep_text_when_bad_type     keep the value when the conversion type does not work
        @param      break_at                    if != -1, stop when this limit is reached
        @param      strip_space                 remove space around columns if True
        @param      force_sep                   if != None, impose a column separator
        @param      nb_line_guess               number of lines used to guess types
        @param      mistake                     not more than mistake conversion in numbers are allowed
        @param      encoding                    encoding
        @param      strict_separator            strict number of columns, it assumes there is no separator
                                                in the content of every column

        Raises an ``Exception`` if *changes* is a list, *regex* is a string without
        named groups and both do not hold the same number of fields.
        """
        if changes is None:
            changes = {}

        TextFile.__init__(self, filename, errors, fLOG=fLOG, encoding=encoding)

        self._force_header = force_header
        self._force_noheader = force_noheader
        self._changes = changes
        self._regexfix = regex
        self._filter_dict = filter
        self._fields = fields
        self._keep_text_when_bad_type = keep_text_when_bad_type
        self._break_at = break_at
        self._strip_space = strip_space
        self._force_sep = force_sep
        self._nb_guess_line = nb_line_guess
        self._mistake = mistake
        self._strict_separator = strict_separator
        self._encoding = encoding

        if isinstance(changes, list):
            # A list names the columns by position: c0, c1, ...
            # The former digit-counting loop never incremented its counter, the
            # format was always "c%00d" which renders exactly like "c%d".
            self._changes = {"c%d" % i: c for i, c in enumerate(changes)}

            if self._regexfix is not None and \
                    not isinstance(self._regexfix, dict) and \
                    "(?P<" not in self._regexfix:
                # The expression has anonymous groups: every group is renamed
                # after the corresponding name in *changes*.
                reg = re.compile("[(](.+?)[)]")
                fi = reg.findall(self._regexfix)
                if len(fi) != len(changes):
                    raise Exception(
                        "not the same number of fields in regular expression (%d,%d):\n%s\n%s" %
                        (len(fi), len(changes), str(fi), str(changes)))
                exp = ["(?P<%s>%s)" % (b, a) for a, b in zip(fi, changes)]
                # The character following the first group is used as the separator
                # between groups; there is none if the expression stops after its
                # first group (indexing it used to raise IndexError).
                p = self._regexfix.find(")") + 1
                sep = self._regexfix[p] if p < len(self._regexfix) else ""
                self._regexfix = sep.join(exp)
                self.LOG("split: ", fi)
                self.LOG("new regex: ", self._regexfix)
            else:
                self.LOG(" TextFileColumns (1): regex: ", self._regexfix)
        else:
            self.LOG(" TextFileColumns (2): regex: ", self._regexfix)

    def __str__(self):
        """
        Returns the string representation of the instance dictionary
        (every attribute and its value).
        """
        return str(self.__dict__)

    def get_columns(self):
        """
        @return         the columns

        Raises an ``Exception`` if the columns were not determined yet
        (they are set by @see me open).
        """
        if "_columns" not in self.__dict__:
            raise Exception("there is no available columns")
        return self._columns

    def open(self):
        """
        Opens the file and finds out if there is a header, what the columns are, what their types are...
        Any information about which format was found is logged.

        Raises a ``RuntimeError`` if the regular expression describing a line cannot be compiled.
        """
        if "_header" not in self.__dict__:
            regex_by_field = self._regexfix if isinstance(
                self._regexfix, dict) else {}
            header, columns, sep, regex = self.guess_columns(
                force_header=self._force_header,
                changes=self._changes,
                force_noheader=self._force_noheader,
                fields=self._fields,
                regex=regex_by_field,
                force_sep=self._force_sep,
                nb=self._nb_guess_line,
                mistake=self._mistake)
            if self._regexfix is not None and not isinstance(
                    self._regexfix, dict):
                # a regular expression given as a string overrides the guessed one
                regex = self._regexfix

            # The expression is compiled before any attribute is stored: a failure
            # must not leave ``_header`` defined without ``_regex``, ``_name``, ...
            try:
                compiled = re.compile(regex)
            except Exception as e:
                raise RuntimeError(
                    "algorithm problem: (type %r, %r)\nunable to understand a regular expression (file %r)\nexp: %r" %
                    (str(type(e)), str(e), self.filename, regex)) from e

            self._header = header
            self._columns = columns
            self._sep = sep
            self._regex = compiled
            self._name = {}
            self._nb = 0
            self._conv = {}
            # _columns maps a position to (name, type); _name is the reverse mapping,
            # _conv only keeps the columns which need a numerical conversion
            for k, v in self._columns.items():
                self._name[v[0]] = (k, v[1])
                if v[1] in [int, float, decimal.Decimal]:
                    self._conv[v[0]] = v[1]
        # number of open() calls not yet matched by a close()
        self._nb += 1
        TextFile.open(self)

    def close(self):
        """
        Closes the file and removes all information related to the format,
        next time it is opened, the format will be checked again.
        """
        TextFile.close(self)
        self._nb -= 1
        if self._nb == 0:
            for name in ("_header", "_columns", "_regex", "_name", "_conv"):
                del self.__dict__[name]

    def __iter__(self):
        """
        Iterates on the rows of the file, the header (if any) is skipped.

        @return         a dictionary ``{ column_name: value }`` for every row

        A row which cannot be split into columns or whose values cannot be converted
        (unless *keep_text_when_bad_type* is True) is logged and skipped.
        An exception is raised when more than four such errors happened and they
        represent more than 10% of the rows read so far (``Exception`` for rows
        which cannot be split, ``RuntimeError`` for conversion issues).
        An ``Exception`` is raised if the file was not opened.
        """
        if "_header" not in self.__dict__:
            raise Exception("file not open %s" % self.filename)

        # non greedy version of the expression, tried first on ambiguous lines
        regex_simple = re.compile(self._regex.pattern.replace(">.*)", ">.*?)"))
        nb_columns = len(self._columns)
        numeric = (int, float, decimal.Decimal)

        nb = 0
        nberr = 0
        nbert = 0
        for line in TextFile.__iter__(self):
            if nb == 0 and self._header:
                nb += 1
                continue

            tempc = line.split(self._sep)
            res = None
            if len(tempc) == nb_columns:
                # the easy case: exactly one value per column
                res = {self._columns[i][0]: a for i, a in enumerate(tempc)}
            elif not self._strict_separator and len(tempc) > nb_columns:
                # the separator appears inside a value, the regular expressions
                # decide (fewer pieces than columns cannot be interpreted)
                r = regex_simple.match(line)
                if r is None:
                    r = self._regex.match(line)
                if r is not None:
                    res = r.groupdict()

            if res is None:
                if nberr == 0:
                    self.LOG(self._regex.pattern)
                self.LOG("error regex", nberr, "unable to interpret line ",
                         nb, ": ", repr(line))
                nberr += 1
                if nberr * 10 > nb and nberr > 4:
                    message = "pattern: %s\n line: %s" % (
                        regex_simple.pattern, line)
                    raise Exception(
                        "(a) there are probably too many errors %d (%d)\n%s" %
                        (nberr, nb, message))
            else:
                if self._strip_space:
                    for k in res:
                        res[k] = res[k].strip()
                giveup = False

                for k in res:
                    if k not in self._conv:
                        continue
                    conv = self._conv[k]
                    try:
                        if len(res[k]) == 0 and conv in numeric:
                            # an empty cell is read as zero
                            ttt = conv(0)
                        else:
                            ttt = conv(res[k])
                        res[k] = ttt
                    except (ValueError, decimal.InvalidOperation):
                        # decimal.Decimal signals a bad literal with InvalidOperation
                        # which is not a ValueError
                        nbert += 1
                        if self._keep_text_when_bad_type:
                            if nbert % 1000 == 1:
                                self.LOG("error type", nbert,
                                         "unable to interpret line (but keep it) ",
                                         nb, "value", repr(res[k]),
                                         " type ", repr(conv),
                                         " line ", repr(line))
                        else:
                            self.LOG("error type", nbert,
                                     "unable to interpret line ", nb,
                                     "value", repr(res[k]),
                                     " type ", repr(conv),
                                     " line ", repr(line))
                            if nbert * 10 > nb and nbert > 4:
                                message = "pattern: %s\n line: %s" % (
                                    regex_simple.pattern, line)
                                # reports the number of type errors (nbert),
                                # not the number of regex errors
                                raise RuntimeError(
                                    "(b) there are probably too many errors %r\n%r" %
                                    (nbert, message))
                            giveup = True
                            break
                if giveup:
                    # NOTE: a row dropped here is not counted in nb (historical behaviour)
                    continue
                if self._filter_dict is None or self._filter_dict(res):
                    yield res

            nb += 1
            if self._break_at != -1 and nb > self._break_at:
                break

    @staticmethod
    def _store(output, la, encoding="utf-8"):
        """
        Stores a list of dictionaries into a file (adds a header).
        The columns are the sorted keys of the first dictionary, values are
        converted with ``str`` and separated by tabulations.

        @param      output      filename
        @param      la          list of dictionaries key:value
        @param      encoding    encoding
        @warning    format is utf-8 unless *encoding* is specified
        """
        sepline = "\n"  # GetSepLine ()
        # the context manager closes the file even if a row cannot be written
        with open(output, "w", encoding=encoding) as f:
            keys = None
            for d in la:
                if keys is None:
                    keys = sorted(d)
                    f.write("\t".join(keys) + sepline)
                f.write("\t".join(str(d[k]) for k in keys) + sepline)

    def _dump_sorted_chunk(self, memo, folder, files):
        """
        Sorts the buffered pairs ``(key, row)``, stores the rows into the next
        temporary file of *folder* and appends its name to *files*.

        @param      memo        list of pairs ``(key, row)``, sorted inplace
        @param      folder      folder which receives the temporary file
        @param      files       list of temporary files already written (modified)
        """
        memo.sort(key=lambda el: el[0])
        rows = [el[1] for el in memo]
        tempout = os.path.join(folder, "root_%05d.txt" % len(files))
        self.LOG("writing file %d lines in " % len(rows), tempout)
        TextFileColumns._store(tempout, rows)
        files.append(tempout)

    def sort(self, output, key, maxmemory=2 ** 28, folder=None, fLOG=noLOG):
        """
        Sorts a text file, even a big one, one or several columns give the order.
        The file is split into sorted temporary files which are merged with @see me fusion.

        @param      output      output file result
        @param      key         lines sorted depending on these columns (a name or a sequence of names)
        @param      maxmemory   a file is split into smaller files which contain not more than maxmemory lines
        @param      folder      the function needs to create temporary files, this folder will contain them
                                before they get removed
        @param      fLOG        logging function (unused, the logging function of the instance is used)
        @return                 None

        @warning We assume this file is not opened.

        Raises an ``Exception`` if *folder* does not exist or if a column of *key* is missing,
        a ``RuntimeError`` if *output* cannot be created.
        """
        if isinstance(key, str):
            key = (key,)
        if folder is None:
            folder = GetPath()
        if not os.path.exists(folder):
            raise Exception("unable to find folder %s" % folder)

        # checks the output can be written before doing any work
        try:
            with open(output, "w", encoding=self._encoding):
                pass
        except Exception as e:
            raise RuntimeError(
                "Unable to create file %r, reason: %r" %
                (output, str(e))) from e

        self.LOG("sorting file ", self.filename)
        files = []
        try:
            memo = []
            self.open()
            try:
                for line in self:
                    try:
                        k = tuple(line[name] for name in key)
                    except KeyError as e:
                        raise Exception("unable to find one column in\n{0}".format(
                            self.get_columns())) from e
                    memo.append((k, line))
                    if len(memo) > maxmemory:
                        self._dump_sorted_chunk(memo, folder, files)
                        memo = []

                if len(memo) > 0:
                    self._dump_sorted_chunk(memo, folder, files)
                    memo = []
            finally:
                # the file is closed even if a row could not be processed
                self.close()

            TextFileColumns.fusion(
                key,
                files,
                output,
                force_header=self._force_header,
                fLOG=self.LOG)
        finally:
            # temporary files are removed whatever happened
            for m in files:
                self.LOG("removing ", m)
                os.remove(m)

    @staticmethod
    def fusion(key, files, output, force_header=False, encoding="utf-8", fLOG=noLOG):
        """
        Does a fusion between several files
        with the same columns (different order is allowed).

        @param      key             columns to be compared (a name or a sequence of names)
        @param      files           list of files
        @param      output          output file
        @param      force_header    impose the first line as a header
        @param      encoding        encoding
        @param      fLOG            logging function
        @warning    We assume all files are sorted depending on columns in key

        The output contains a header made of the sorted column names of the first
        written row. Raises an ``Exception`` if a column of *key* is missing.
        """
        if isinstance(key, str):
            key = [key]
        sepline = "\n"  # GetSepLine ()

        def row_key(row, handle):
            # value of the comparison key for one row
            try:
                return tuple(row[name] for name in key)
            except KeyError as e:
                raise Exception("unable to find one column in\n{0}".format(
                    handle.get_columns())) from e

        opened = []
        try:
            for f in files:
                h = TextFileColumns(f, force_header=force_header,
                                    encoding=encoding, fLOG=fLOG)
                h.open()
                opened.append(h)

            with open(output, "w", encoding=encoding) as res:
                # start: one pending entry [key, row, file, iterator]
                # for every file which is not empty
                kline = []
                for h in opened:
                    it = iter(h)
                    d = next(it, None)
                    if d is not None:
                        kline.append([row_key(d, h), d, h, it])

                # loop
                keys = None
                while kline:
                    # picking the first entry holding the smallest key
                    pos = min(range(len(kline)), key=lambda i: kline[i][0])
                    _, d, h, it = kline.pop(pos)

                    # adding
                    if keys is None:
                        keys = sorted(d)
                        res.write("\t".join(keys) + sepline)
                    res.write("\t".join(str(d[k_]) for k_ in keys) + sepline)

                    # next row of the same file if there is one
                    d = next(it, None)
                    if d is not None:
                        kline.append([row_key(d, h), d, h, it])
        finally:
            # end: every opened file is closed, even after a failure
            for h in opened:
                h.close()