Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1# -*- coding:utf-8 -*- 

2""" 

3@file 

4@brief To format a pandas dataframe 

5""" 

6import warnings 

7 

8 

9def enumerate_split_df(df, common, subsets): 

10 """ 

11 Splits a dataframe by columns to display shorter 

12 dataframes. 

13 

14 @param df dataframe 

15 @param common common columns 

16 @param subsets subsets of columns 

17 @return split dataframes 

18 

19 .. runpython:: 

20 :showcode: 

21 

22 from pandas import DataFrame 

23 from pyquickhelper.pandashelper.tblformat import enumerate_split_df 

24 

25 df = DataFrame([{'A': 0, 'B': 'text'}, 

26 {'A': 1e-5, 'C': 'longer text'}]) 

27 res = list(enumerate_split_df(df, ['A'], [['B'], ['C']])) 

28 print(res[0]) 

29 print('-----') 

30 print(res[1]) 

31 """ 

32 for sub in subsets: 

33 if set(common) & set(sub): 

34 raise ValueError("Common columns between common={} and subset={}.".format( 

35 common, sub)) 

36 yield df[common + sub] 

37 

38 

39def df2rst(df, add_line=True, align="l", column_size=None, index=False, 

40 list_table=False, title=None, header=True, sep=',', 

41 number_format=None, replacements=None, split_row=None, 

42 split_row_level="+", split_col_common=None, 

43 split_col_subsets=None, filter_rows=None, 

44 label_pattern=None): 

45 """ 

46 Builds a string in :epkg:`RST` format from a :epkg:`dataframe`. 

47 

48 :param df: dataframe 

49 :param add_line: (bool) add a line separator between each row 

50 :param align: ``r`` or ``l`` or ``c`` 

51 :param column_size: something like ``[1, 2, 5]`` to multiply the column size, 

52 a dictionary (if *list_table* is False) to overwrite 

53 a column size like ``{'col_name1': 20}`` or ``{3: 20}`` 

54 :param index: add the index 

55 :param list_table: use the 

56 `list_table <http://docutils.sourceforge.net/docs/ref/rst/directives.html#list-table>`_ 

57 :param title: used only if *list_table* is True 

58 :param header: add one header 

59 :param sep: separator if *df* is a string and is a filename to load 

60 :param number_format: formats number in a specific way, if *number_format* 

61 is an integer, the pattern is replaced by 

62 ``{numpy.float64: '{:.2g}'}`` (if *number_format* is 2), 

63 see also :epkg:`pyformat.info` 

64 :param replacements: replacements just before converting into RST (dictionary) 

65 :param split_row: displays several table, one column is used as the 

66 name of each section 

67 :param split_row_level: title level if option *split_row* is used 

68 :param split_col_common: splits the dataframe by columns, see @see fn enumerate_split_df 

69 :param split_col_subsets: splits the dataframe by columns, see @see fn enumerate_split_df 

70 :param filter_rows: None or function to removes rows, signature 

71 ``def filter_rows(df: DataFrame) -> DataFrame`` 

72 :param label_pattern: if *split_row* is used, the function may insert 

73 a label in front of every section, example: ``".. _lpy-{section}:"`` 

74 :return: string 

75 

76 If *list_table* is False, the format is the following. 

77 

78 *None* values are replaced by empty string (4 spaces). 

79 It produces the following results: 

80 

81 :: 

82 

83 +------------------------+------------+----------+----------+ 

84 | Header row, column 1 | Header 2 | Header 3 | Header 4 | 

85 | (header rows optional) | | | | 

86 +========================+============+==========+==========+ 

87 | body row 1, column 1 | column 2 | column 3 | column 4 | 

88 +------------------------+------------+----------+----------+ 

89 | body row 2 | ... | ... | | 

90 +------------------------+------------+----------+----------+ 

91 

92 If *list_table* is True, the format is the following: 

93 

94 :: 

95 

96 .. list-table:: title 

97 :widths: 15 10 30 

98 :header-rows: 1 

99 

100 * - Treat 

101 - Quantity 

102 - Description 

103 * - Albatross 

104 - 2.99 

105 - anythings 

106 ... 

107 

108 .. exref:: 

109 :title: Convert a dataframe into RST 

110 

111 .. runpython:: 

112 :showcode: 

113 

114 from pandas import DataFrame 

115 from pyquickhelper.pandashelper import df2rst 

116 

117 df = DataFrame([{'A': 0, 'B': 'text'}, 

118 {'A': 1e-5, 'C': 'longer text'}]) 

119 print(df2rst(df)) 

120 

121 .. exref:: 

122 :title: Convert a dataframe into markdown 

123 

124 .. runpython:: 

125 :showcode: 

126 

127 from io import StringIO 

128 from textwrap import dedent 

129 import pandas 

130 

131 from_excel = dedent(''' 

132 Op;axes;shape;SpeedUp 

133 ReduceMax;(3,);(8, 24, 48, 8);2.96 

134 ReduceMax;(3,);(8, 24, 48, 16);2.57 

135 ReduceMax;(3,);(8, 24, 48, 32);2.95 

136 ReduceMax;(3,);(8, 24, 48, 64);3.28 

137 ReduceMax;(3,);(8, 24, 48, 100);3.05 

138 ReduceMax;(3,);(8, 24, 48, 128);3.11 

139 ReduceMax;(3,);(8, 24, 48, 200);2.86 

140 ReduceMax;(3,);(8, 24, 48, 256);2.50 

141 ReduceMax;(3,);(8, 24, 48, 400);2.48 

142 ReduceMax;(3,);(8, 24, 48, 512);2.90 

143 ReduceMax;(3,);(8, 24, 48, 1024);2.76 

144 ReduceMax;(0,);(8, 24, 48, 8);19.29 

145 ReduceMax;(0,);(8, 24, 48, 16);11.83 

146 ReduceMax;(0,);(8, 24, 48, 32);5.69 

147 ReduceMax;(0,);(8, 24, 48, 64);5.49 

148 ReduceMax;(0,);(8, 24, 48, 100);6.13 

149 ReduceMax;(0,);(8, 24, 48, 128);6.27 

150 ReduceMax;(0,);(8, 24, 48, 200);5.46 

151 ReduceMax;(0,);(8, 24, 48, 256);4.76 

152 ReduceMax;(0,);(8, 24, 48, 400);2.21 

153 ReduceMax;(0,);(8, 24, 48, 512);4.52 

154 ReduceMax;(0,);(8, 24, 48, 1024);4.38 

155 ReduceSum;(3,);(8, 24, 48, 8);1.79 

156 ReduceSum;(3,);(8, 24, 48, 16);0.79 

157 ReduceSum;(3,);(8, 24, 48, 32);1.67 

158 ReduceSum;(3,);(8, 24, 48, 64);1.19 

159 ReduceSum;(3,);(8, 24, 48, 100);2.08 

160 ReduceSum;(3,);(8, 24, 48, 128);2.96 

161 ReduceSum;(3,);(8, 24, 48, 200);1.66 

162 ReduceSum;(3,);(8, 24, 48, 256);2.26 

163 ReduceSum;(3,);(8, 24, 48, 400);1.76 

164 ReduceSum;(3,);(8, 24, 48, 512);2.61 

165 ReduceSum;(3,);(8, 24, 48, 1024);2.21 

166 ReduceSum;(0,);(8, 24, 48, 8);2.56 

167 ReduceSum;(0,);(8, 24, 48, 16);2.05 

168 ReduceSum;(0,);(8, 24, 48, 32);3.04 

169 ReduceSum;(0,);(8, 24, 48, 64);2.57 

170 ReduceSum;(0,);(8, 24, 48, 100);2.41 

171 ReduceSum;(0,);(8, 24, 48, 128);2.77 

172 ReduceSum;(0,);(8, 24, 48, 200);2.02 

173 ReduceSum;(0,);(8, 24, 48, 256);1.61 

174 ReduceSum;(0,);(8, 24, 48, 400);1.59 

175 ReduceSum;(0,);(8, 24, 48, 512);1.48 

176 ReduceSum;(0,);(8, 24, 48, 1024);1.50 

177 ''') 

178 

179 df = pandas.read_csv(StringIO(from_excel), sep=";") 

180 print(df.columns) 

181 

182 sub = df[["Op", "axes", "shape", "SpeedUp"]] 

183 piv = df.pivot_table(values="SpeedUp", index=['axes', "shape"], columns="Op") 

184 piv = piv.reset_index(drop=False) 

185 

186 print(piv.to_markdown(index=False)) 

187 

188 .. versionchanged:: 1.9 

189 Nan value are replaced by empty string even if 

190 *number_format* is not None. 

191 Parameters *replacements*, *split_row*, *split_col_subsets*, 

192 *split_col_common*, *filter_rows* were added. 

193 """ 

194 if isinstance(df, str): 

195 import pandas # pragma: no cover 

196 df = pandas.read_csv( # pragma: no cover 

197 df, encoding="utf-8", sep=sep) 

198 

199 if split_row is not None: 

200 gdf = df.groupby(split_row) 

201 rows = [] 

202 for key, g in gdf: 

203 key = str(key).strip('()') 

204 if ':ref:' in key: 

205 try: 

206 key = key.split("`")[1].split("<")[0].strip() 

207 except IndexError: # pragma: no cover 

208 pass 

209 if label_pattern is not None: 

210 lab = label_pattern.format(section=key.replace(".", "D")) 

211 rows.append("") 

212 rows.append(lab) 

213 rows.append("") 

214 rows.append(key) 

215 rows.append(split_row_level * len(key)) 

216 rows.append("") 

217 rg = df2rst(g, add_line=add_line, align=align, 

218 column_size=column_size, index=index, 

219 list_table=list_table, 

220 title=title, header=header, sep=sep, 

221 number_format=number_format, replacements=replacements, 

222 split_row=None, split_row_level=None, 

223 split_col_common=split_col_common, 

224 split_col_subsets=split_col_subsets, 

225 filter_rows=filter_rows, 

226 label_pattern=None) 

227 rows.append(rg) 

228 rows.append("") 

229 return "\n".join(rows) 

230 

231 if split_col_common is not None: 

232 rows = [] 

233 for sub in enumerate_split_df(df, split_col_common, split_col_subsets): 

234 rg = df2rst(sub, add_line=add_line, align=align, 

235 column_size=column_size, index=index, 

236 list_table=list_table, 

237 title=title, header=header, sep=sep, 

238 number_format=number_format, 

239 replacements=replacements, 

240 filter_rows=filter_rows) 

241 rows.append(rg) 

242 rows.append('') 

243 return "\n".join(rows) 

244 

245 import numpy 

246 typstr = str 

247 

248 if filter_rows is not None: 

249 df = filter_rows(df).copy() 

250 if df.shape[0] == 0: 

251 return "" 

252 else: 

253 df = df.copy() 

254 

255 def patternification(value, pattern): 

256 if isinstance(value, float) and numpy.isnan(value): 

257 return "" 

258 return pattern.format(value) 

259 

260 def nan2empty(value): 

261 if isinstance(value, float) and numpy.isnan(value): 

262 return "" 

263 return value 

264 

265 if number_format is not None: 

266 if isinstance(number_format, int): 

267 number_format = "{:.%dg}" % number_format 

268 import pandas 

269 typ1 = numpy.float64 

270 _df = pandas.DataFrame({'f': [0.12]}) 

271 typ2 = list(_df.dtypes)[0] 

272 number_format = {typ1: number_format, typ2: number_format} 

273 df = df.copy() 

274 for name, typ in zip(df.columns, df.dtypes): 

275 if name in number_format: 

276 pattern = number_format[name] 

277 df[name] = df[name].apply( 

278 lambda x: patternification(x, pattern)) 

279 elif typ in number_format: 

280 pattern = number_format[typ] 

281 df[name] = df[name].apply( 

282 lambda x: patternification(x, pattern)) 

283 

284 # check empty strings 

285 col_strings = df.select_dtypes(include=[object]).columns 

286 for c in col_strings: 

287 df[c] = df[c].apply(nan2empty) 

288 

289 if index: 

290 df = df.reset_index(drop=False).copy() 

291 ind = df.columns[0] 

292 

293 def boldify(x): 

294 try: 

295 return "**{0}**".format(x) 

296 except Exception as e: # pragma: no cover 

297 raise Exception( 

298 "Unable to boldify type {0}".format(type(x))) from e 

299 

300 try: 

301 values = df[ind].apply(boldify) 

302 except Exception: # pragma: no cover 

303 warnings.warn("Unable to boldify the index (1).", SyntaxWarning) 

304 

305 try: 

306 df[ind] = values 

307 except Exception: # pragma: no cover 

308 warnings.warn("Unable to boldify the index (2).", SyntaxWarning) 

309 

310 def align_string(s, align, length): 

311 if len(s) < length: 

312 if align == "l": 

313 return s + " " * (length - len(s)) 

314 if align == "r": 

315 return " " * (length - len(s)) + s 

316 if align == "c": 

317 m = (length - len(s)) // 2 

318 return " " * m + s + " " * (length - m - len(s)) 

319 raise ValueError( # pragma: no cover 

320 "align should be 'l', 'r', 'c' not '{0}'".format(align)) 

321 return s 

322 

323 def complete(cool): 

324 if list_table: 

325 i, s = cool 

326 if s is None: 

327 s = "" # pragma: no cover 

328 if isinstance(s, float) and numpy.isnan(s): 

329 s = "" 

330 else: 

331 s = typstr(s).replace("\n", " ") 

332 if replacements is not None: 

333 if s in replacements: 

334 s = replacements[s] 

335 return (" " + s) if s else s 

336 else: 

337 i, s = cool 

338 if s is None: 

339 s = " " * 4 # pragma: no cover 

340 if isinstance(s, float) and numpy.isnan(s): 

341 s = "" # pragma: no cover 

342 else: 

343 s = typstr(s).replace("\n", " ") 

344 i -= 2 

345 if replacements is not None: 

346 if s in replacements: 

347 s = replacements[s] 

348 s = align_string(s.strip(), align, i) 

349 return s 

350 

351 if list_table: 

352 

353 def format_on_row(row): 

354 one = "\n -".join(map(complete, enumerate(row))) 

355 res = " * -" + one 

356 return res 

357 

358 rows = [".. list-table:: {0}".format(title if title else "").strip()] 

359 if column_size is None: 

360 rows.append(" :widths: auto") 

361 else: 

362 rows.append(" :widths: " + " ".join(map(str, column_size))) 

363 if header: 

364 rows.append(" :header-rows: 1") 

365 rows.append("") 

366 if header: 

367 rows.append(format_on_row(df.columns)) 

368 rows.extend(map(format_on_row, df.values)) 

369 rows.append("") 

370 table = "\n".join(rows) 

371 return table 

372 else: 

373 length = [(len(_) if isinstance(_, typstr) else 5) for _ in df.columns] 

374 for row in df.values: 

375 for i, v in enumerate(row): 

376 length[i] = max(length[i], len(typstr(v).strip())) 

377 if column_size is not None: 

378 if isinstance(column_size, list): 

379 if len(length) != len(column_size): 

380 raise ValueError( # pragma: no cover 

381 "length and column_size should have the same size {0} != {1}".format( 

382 len(length), len(column_size))) 

383 for i in range(len(length)): 

384 if not isinstance(column_size[i], int): 

385 raise TypeError( # pragma: no cover 

386 "column_size[{0}] is not an integer".format(i)) 

387 length[i] *= column_size[i] 

388 elif isinstance(column_size, dict): 

389 for i, c in enumerate(df.columns): 

390 if c in column_size: 

391 length[i] = column_size[c] 

392 elif i in column_size: 

393 length[i] = column_size[i] 

394 else: 

395 raise TypeError( # pragma: no cover 

396 "column_size must be a list or a dictionary not {}".format( 

397 type(column_size))) 

398 

399 ic = 2 

400 length = [_ + ic for _ in length] 

401 line = ["-" * lc for lc in length] 

402 lineb = ["=" * lc for lc in length] 

403 sline = "+%s+" % ("+".join(line)) 

404 slineb = "+%s+" % ("+".join(lineb)) 

405 res = [sline] 

406 

407 res.append("| %s |" % " | ".join( 

408 map(complete, zip(length, df.columns)))) 

409 res.append(slineb) 

410 res.extend(["| %s |" % " | ".join(map(complete, zip(length, row))) 

411 for row in df.values]) 

412 if add_line: 

413 t = len(res) 

414 for i in range(t - 1, 3, -1): 

415 res.insert(i, sline) 

416 res.append(sline) 

417 table = "\n".join(res) + "\n" 

418 return table 

419 

420 

421def df2html(self, class_table=None, class_td=None, class_tr=None, 

422 class_th=None): 

423 """ 

424 Converts the table into a :epkg:`html` string. 

425 

426 :param self: dataframe (to be added as a class method) 

427 :param class_table: adds a class to the tag ``table`` (None for none) 

428 :param class_td: adds a class to the tag ``td`` (None for none) 

429 :param class_tr: adds a class to the tag ``tr`` (None for none) 

430 :param class_th: adds a class to the tag ``th`` (None for none) 

431 :return: HTML 

432 """ 

433 clta = ' class="%s"' % class_table if class_table is not None else "" 

434 cltr = ' class="%s"' % class_tr if class_tr is not None else "" 

435 cltd = ' class="%s"' % class_td if class_td is not None else "" 

436 clth = ' class="%s"' % class_th if class_th is not None else "" 

437 

438 rows = ["<table%s>" % clta] 

439 rows.append(("<tr%s><th%s>" % (cltr, clth)) + ("</th><th%s>" % 

440 clth).join(self.columns) + "</th></tr>") 

441 septd = "</td><td%s>" % cltd 

442 strtd = "<tr%s><td%s>" % (cltr, cltd) 

443 

444 typstr = str 

445 

446 def conv(s): 

447 if s is None: 

448 return "" # pragma: no cover 

449 return typstr(s) 

450 

451 for row in self.values: 

452 s = septd.join(conv(_) for _ in row) 

453 rows.append(strtd + s + "</td></tr>") 

454 rows.append("</table>") 

455 rows.append("") 

456 return "\n".join(rows)