Coverage for pyquickhelper/pandashelper/tblformat.py: 100%

181 statements  

« prev     ^ index     » next       coverage.py v7.2.7, created at 2023-06-03 02:21 +0200

1# -*- coding:utf-8 -*- 

2""" 

3@file 

4@brief To format a pandas dataframe 

5""" 

6import warnings 

7 

8 

9def enumerate_split_df(df, common, subsets): 

10 """ 

11 Splits a dataframe by columns to display shorter 

12 dataframes. 

13 

14 @param df dataframe 

15 @param common common columns 

16 @param subsets subsets of columns 

17 @return split dataframes 

18 

19 .. runpython:: 

20 :showcode: 

21 

22 from pandas import DataFrame 

23 from pyquickhelper.pandashelper.tblformat import enumerate_split_df 

24 

25 df = DataFrame([{'A': 0, 'B': 'text'}, 

26 {'A': 1e-5, 'C': 'longer text'}]) 

27 res = list(enumerate_split_df(df, ['A'], [['B'], ['C']])) 

28 print(res[0]) 

29 print('-----') 

30 print(res[1]) 

31 """ 

32 for sub in subsets: 

33 if set(common) & set(sub): 

34 raise ValueError("Common columns between common={} and subset={}.".format( 

35 common, sub)) 

36 yield df[common + sub] 

37 

38 

39def df2rst(df, add_line=True, align="l", column_size=None, index=False, 

40 list_table=False, title=None, header=True, sep=',', 

41 number_format=None, replacements=None, split_row=None, 

42 split_row_level="+", split_col_common=None, 

43 split_col_subsets=None, filter_rows=None, 

44 label_pattern=None): 

45 """ 

46 Builds a string in :epkg:`RST` format from a :epkg:`dataframe`. 

47 

48 :param df: dataframe 

49 :param add_line: (bool) add a line separator between each row 

50 :param align: ``r`` or ``l`` or ``c`` 

51 :param column_size: something like ``[1, 2, 5]`` to multiply the column size, 

52 a dictionary (if *list_table* is False) to overwrite 

53 a column size like ``{'col_name1': 20}`` or ``{3: 20}`` 

54 :param index: add the index 

55 :param list_table: use the 

56 `list_table <http://docutils.sourceforge.net/docs/ref/rst/directives.html#list-table>`_ 

57 :param title: used only if *list_table* is True 

58 :param header: add one header 

59 :param sep: separator if *df* is a string and is a filename to load 

60 :param number_format: formats number in a specific way, if *number_format* 

61 is an integer, the pattern is replaced by 

62 ``{numpy.float64: '{:.2g}'}`` (if *number_format* is 2), 

63 see also :epkg:`pyformat.info` 

64 :param replacements: replacements just before converting into RST (dictionary) 

65 :param split_row: displays several table, one column is used as the 

66 name of each section 

67 :param split_row_level: title level if option *split_row* is used 

68 :param split_col_common: splits the dataframe by columns, see @see fn enumerate_split_df 

69 :param split_col_subsets: splits the dataframe by columns, see @see fn enumerate_split_df 

70 :param filter_rows: None or function to removes rows, signature 

71 ``def filter_rows(df: DataFrame) -> DataFrame`` 

72 :param label_pattern: if *split_row* is used, the function may insert 

73 a label in front of every section, example: ``".. _lpy-{section}:"`` 

74 :return: string 

75 

76 If *list_table* is False, the format is the following. 

77 

78 *None* values are replaced by empty string (4 spaces). 

79 It produces the following results: 

80 

81 :: 

82 

83 +------------------------+------------+----------+----------+ 

84 | Header row, column 1 | Header 2 | Header 3 | Header 4 | 

85 | (header rows optional) | | | | 

86 +========================+============+==========+==========+ 

87 | body row 1, column 1 | column 2 | column 3 | column 4 | 

88 +------------------------+------------+----------+----------+ 

89 | body row 2 | ... | ... | | 

90 +------------------------+------------+----------+----------+ 

91 

92 If *list_table* is True, the format is the following: 

93 

94 :: 

95 

96 .. list-table:: title 

97 :widths: 15 10 30 

98 :header-rows: 1 

99 

100 * - Treat 

101 - Quantity 

102 - Description 

103 * - Albatross 

104 - 2.99 

105 - anythings 

106 ... 

107 

108 .. exref:: 

109 :title: Convert a dataframe into RST 

110 

111 .. runpython:: 

112 :showcode: 

113 

114 from pandas import DataFrame 

115 from pyquickhelper.pandashelper import df2rst 

116 

117 df = DataFrame([{'A': 0, 'B': 'text'}, 

118 {'A': 1e-5, 'C': 'longer text'}]) 

119 print(df2rst(df)) 

120 

121 .. exref:: 

122 :title: Convert a dataframe into markdown 

123 

124 .. runpython:: 

125 :showcode: 

126 

127 from io import StringIO 

128 from textwrap import dedent 

129 import pandas 

130 

131 from_excel = dedent(''' 

132 Op;axes;shape;SpeedUp 

133 ReduceMax;(3,);(8, 24, 48, 8);2.96 

134 ReduceMax;(3,);(8, 24, 48, 16);2.57 

135 ReduceMax;(3,);(8, 24, 48, 32);2.95 

136 ReduceMax;(3,);(8, 24, 48, 64);3.28 

137 ReduceMax;(3,);(8, 24, 48, 100);3.05 

138 ReduceMax;(3,);(8, 24, 48, 128);3.11 

139 ReduceMax;(3,);(8, 24, 48, 200);2.86 

140 ReduceMax;(3,);(8, 24, 48, 256);2.50 

141 ReduceMax;(3,);(8, 24, 48, 400);2.48 

142 ReduceMax;(3,);(8, 24, 48, 512);2.90 

143 ReduceMax;(3,);(8, 24, 48, 1024);2.76 

144 ReduceMax;(0,);(8, 24, 48, 8);19.29 

145 ReduceMax;(0,);(8, 24, 48, 16);11.83 

146 ReduceMax;(0,);(8, 24, 48, 32);5.69 

147 ReduceMax;(0,);(8, 24, 48, 64);5.49 

148 ReduceMax;(0,);(8, 24, 48, 100);6.13 

149 ReduceMax;(0,);(8, 24, 48, 128);6.27 

150 ReduceMax;(0,);(8, 24, 48, 200);5.46 

151 ReduceMax;(0,);(8, 24, 48, 256);4.76 

152 ReduceMax;(0,);(8, 24, 48, 400);2.21 

153 ReduceMax;(0,);(8, 24, 48, 512);4.52 

154 ReduceMax;(0,);(8, 24, 48, 1024);4.38 

155 ReduceSum;(3,);(8, 24, 48, 8);1.79 

156 ReduceSum;(3,);(8, 24, 48, 16);0.79 

157 ReduceSum;(3,);(8, 24, 48, 32);1.67 

158 ReduceSum;(3,);(8, 24, 48, 64);1.19 

159 ReduceSum;(3,);(8, 24, 48, 100);2.08 

160 ReduceSum;(3,);(8, 24, 48, 128);2.96 

161 ReduceSum;(3,);(8, 24, 48, 200);1.66 

162 ReduceSum;(3,);(8, 24, 48, 256);2.26 

163 ReduceSum;(3,);(8, 24, 48, 400);1.76 

164 ReduceSum;(3,);(8, 24, 48, 512);2.61 

165 ReduceSum;(3,);(8, 24, 48, 1024);2.21 

166 ReduceSum;(0,);(8, 24, 48, 8);2.56 

167 ReduceSum;(0,);(8, 24, 48, 16);2.05 

168 ReduceSum;(0,);(8, 24, 48, 32);3.04 

169 ReduceSum;(0,);(8, 24, 48, 64);2.57 

170 ReduceSum;(0,);(8, 24, 48, 100);2.41 

171 ReduceSum;(0,);(8, 24, 48, 128);2.77 

172 ReduceSum;(0,);(8, 24, 48, 200);2.02 

173 ReduceSum;(0,);(8, 24, 48, 256);1.61 

174 ReduceSum;(0,);(8, 24, 48, 400);1.59 

175 ReduceSum;(0,);(8, 24, 48, 512);1.48 

176 ReduceSum;(0,);(8, 24, 48, 1024);1.50 

177 ''') 

178 

179 df = pandas.read_csv(StringIO(from_excel), sep=";") 

180 print(df.columns) 

181 

182 sub = df[["Op", "axes", "shape", "SpeedUp"]] 

183 piv = df.pivot_table(values="SpeedUp", index=['axes', "shape"], columns="Op") 

184 piv = piv.reset_index(drop=False) 

185 

186 print(piv.to_markdown(index=False)) 

187 

188 Nan value are replaced by empty string even if *number_format* is not None. 

189 """ 

190 if isinstance(df, str): 

191 import pandas # pragma: no cover 

192 df = pandas.read_csv( # pragma: no cover 

193 df, encoding="utf-8", sep=sep) 

194 

195 if split_row is not None: 

196 gdf = df.groupby(split_row) 

197 rows = [] 

198 for key, g in gdf: 

199 key = str(key).strip('()') 

200 if ':ref:' in key: 

201 try: 

202 key = key.split("`")[1].split("<")[0].strip() 

203 except IndexError: # pragma: no cover 

204 pass 

205 if label_pattern is not None: 

206 lab = label_pattern.format(section=key.replace(".", "D")) 

207 rows.append("") 

208 rows.append(lab) 

209 rows.append("") 

210 rows.append(key) 

211 rows.append(split_row_level * len(key)) 

212 rows.append("") 

213 rg = df2rst(g, add_line=add_line, align=align, 

214 column_size=column_size, index=index, 

215 list_table=list_table, 

216 title=title, header=header, sep=sep, 

217 number_format=number_format, replacements=replacements, 

218 split_row=None, split_row_level=None, 

219 split_col_common=split_col_common, 

220 split_col_subsets=split_col_subsets, 

221 filter_rows=filter_rows, 

222 label_pattern=None) 

223 rows.append(rg) 

224 rows.append("") 

225 return "\n".join(rows) 

226 

227 if split_col_common is not None: 

228 rows = [] 

229 for sub in enumerate_split_df(df, split_col_common, split_col_subsets): 

230 rg = df2rst(sub, add_line=add_line, align=align, 

231 column_size=column_size, index=index, 

232 list_table=list_table, 

233 title=title, header=header, sep=sep, 

234 number_format=number_format, 

235 replacements=replacements, 

236 filter_rows=filter_rows) 

237 rows.append(rg) 

238 rows.append('') 

239 return "\n".join(rows) 

240 

241 import numpy 

242 typstr = str 

243 

244 if filter_rows is not None: 

245 df = filter_rows(df).copy() 

246 if df.shape[0] == 0: 

247 return "" 

248 else: 

249 df = df.copy() 

250 

251 def patternification(value, pattern): 

252 if isinstance(value, float) and numpy.isnan(value): 

253 return "" 

254 return pattern.format(value) 

255 

256 def nan2empty(value): 

257 if isinstance(value, float) and numpy.isnan(value): 

258 return "" 

259 return value 

260 

261 if number_format is not None: 

262 if isinstance(number_format, int): 

263 number_format = "{:.%dg}" % number_format 

264 import pandas 

265 typ1 = numpy.float64 

266 _df = pandas.DataFrame({'f': [0.12]}) 

267 typ2 = list(_df.dtypes)[0] 

268 number_format = {typ1: number_format, typ2: number_format} 

269 df = df.copy() 

270 for name, typ in zip(df.columns, df.dtypes): 

271 if name in number_format: 

272 pattern = number_format[name] 

273 df[name] = df[name].apply( 

274 lambda x: patternification(x, pattern)) 

275 elif typ in number_format: 

276 pattern = number_format[typ] 

277 df[name] = df[name].apply( 

278 lambda x: patternification(x, pattern)) 

279 

280 # check empty strings 

281 col_strings = df.select_dtypes(include=[object]).columns 

282 for c in col_strings: 

283 df[c] = df[c].apply(nan2empty) 

284 

285 if index: 

286 df = df.reset_index(drop=False).copy() 

287 ind = df.columns[0] 

288 

289 def boldify(x): 

290 try: 

291 return f"**{x}**" 

292 except Exception as e: # pragma: no cover 

293 raise RuntimeError( 

294 f"Unable to boldify type {type(x)}") from e 

295 

296 try: 

297 values = df[ind].apply(boldify) 

298 except Exception: # pragma: no cover 

299 warnings.warn("Unable to boldify the index (1).", SyntaxWarning) 

300 

301 try: 

302 df[ind] = values 

303 except Exception: # pragma: no cover 

304 warnings.warn("Unable to boldify the index (2).", SyntaxWarning) 

305 

306 def align_string(s, align, length): 

307 if len(s) < length: 

308 if align == "l": 

309 return s + " " * (length - len(s)) 

310 if align == "r": 

311 return " " * (length - len(s)) + s 

312 if align == "c": 

313 m = (length - len(s)) // 2 

314 return " " * m + s + " " * (length - m - len(s)) 

315 raise ValueError( # pragma: no cover 

316 f"align should be 'l', 'r', 'c' not '{align}'") 

317 return s 

318 

319 def complete(cool): 

320 if list_table: 

321 i, s = cool 

322 if s is None: 

323 s = "" # pragma: no cover 

324 if isinstance(s, float) and numpy.isnan(s): 

325 s = "" 

326 else: 

327 s = typstr(s).replace("\n", " ") 

328 if replacements is not None: 

329 if s in replacements: 

330 s = replacements[s] 

331 return (" " + s) if s else s 

332 else: 

333 i, s = cool 

334 if s is None: 

335 s = " " * 4 # pragma: no cover 

336 if isinstance(s, float) and numpy.isnan(s): 

337 s = "" # pragma: no cover 

338 else: 

339 s = typstr(s).replace("\n", " ") 

340 i -= 2 

341 if replacements is not None: 

342 if s in replacements: 

343 s = replacements[s] 

344 s = align_string(s.strip(), align, i) 

345 return s 

346 

347 if list_table: 

348 

349 def format_on_row(row): 

350 one = "\n -".join(map(complete, enumerate(row))) 

351 res = " * -" + one 

352 return res 

353 

354 rows = [f".. list-table:: {title if title else ''}".strip()] 

355 if column_size is None: 

356 rows.append(" :widths: auto") 

357 else: 

358 rows.append(" :widths: " + " ".join(map(str, column_size))) 

359 if header: 

360 rows.append(" :header-rows: 1") 

361 rows.append("") 

362 if header: 

363 rows.append(format_on_row(df.columns)) 

364 rows.extend(map(format_on_row, df.values)) 

365 rows.append("") 

366 table = "\n".join(rows) 

367 return table 

368 else: 

369 length = [(len(_) if isinstance(_, typstr) else 5) for _ in df.columns] 

370 for row in df.values: 

371 for i, v in enumerate(row): 

372 length[i] = max(length[i], len(typstr(v).strip())) 

373 if column_size is not None: 

374 if isinstance(column_size, list): 

375 if len(length) != len(column_size): 

376 raise ValueError( # pragma: no cover 

377 "length and column_size should have the same size {0} != {1}".format( 

378 len(length), len(column_size))) 

379 for i in range(len(length)): 

380 if not isinstance(column_size[i], int): 

381 raise TypeError( # pragma: no cover 

382 f"column_size[{i}] is not an integer") 

383 length[i] *= column_size[i] 

384 elif isinstance(column_size, dict): 

385 for i, c in enumerate(df.columns): 

386 if c in column_size: 

387 length[i] = column_size[c] 

388 elif i in column_size: 

389 length[i] = column_size[i] 

390 else: 

391 raise TypeError( # pragma: no cover 

392 "column_size must be a list or a dictionary not {}".format( 

393 type(column_size))) 

394 

395 ic = 2 

396 length = [_ + ic for _ in length] 

397 line = ["-" * lc for lc in length] 

398 lineb = ["=" * lc for lc in length] 

399 sline = f"+{'+'.join(line)}+" 

400 slineb = f"+{'+'.join(lineb)}+" 

401 res = [sline] 

402 

403 res.append(f"| {' | '.join(map(complete, zip(length, df.columns)))} |") 

404 res.append(slineb) 

405 res.extend([f"| {' | '.join(map(complete, zip(length, row)))} |" 

406 for row in df.values]) 

407 if add_line: 

408 t = len(res) 

409 for i in range(t - 1, 3, -1): 

410 res.insert(i, sline) 

411 res.append(sline) 

412 table = "\n".join(res) + "\n" 

413 return table 

414 

415 

416def df2html(self, class_table=None, class_td=None, class_tr=None, 

417 class_th=None): 

418 """ 

419 Converts the table into a :epkg:`html` string. 

420 

421 :param self: dataframe (to be added as a class method) 

422 :param class_table: adds a class to the tag ``table`` (None for none) 

423 :param class_td: adds a class to the tag ``td`` (None for none) 

424 :param class_tr: adds a class to the tag ``tr`` (None for none) 

425 :param class_th: adds a class to the tag ``th`` (None for none) 

426 :return: HTML 

427 """ 

428 clta = f' class="{class_table}"' if class_table is not None else "" 

429 cltr = f' class="{class_tr}"' if class_tr is not None else "" 

430 cltd = f' class="{class_td}"' if class_td is not None else "" 

431 clth = f' class="{class_th}"' if class_th is not None else "" 

432 

433 rows = [f"<table{clta}>"] 

434 rows.append(f"<tr{cltr}><th{clth}>" + ("</th><th%s>" % 

435 clth).join(self.columns) + "</th></tr>") 

436 septd = f"</td><td{cltd}>" 

437 strtd = f"<tr{cltr}><td{cltd}>" 

438 

439 typstr = str 

440 

441 def conv(s): 

442 if s is None: 

443 return "" # pragma: no cover 

444 return typstr(s) 

445 

446 for row in self.values: 

447 s = septd.join(conv(_) for _ in row) 

448 rows.append(strtd + s + "</td></tr>") 

449 rows.append("</table>") 

450 rows.append("") 

451 return "\n".join(rows)