Cheat Sheet on HTML#
Links: notebook
, html, PDF
, python
, slides, GitHub
Parse HTML and extract information.
from jyquickhelper import add_notebook_menu
# Render a clickable table of contents for the notebook's sections.
add_notebook_menu()
Parse with Python and convert it into JSON#
Inspired by Convert HTML into JSON. We convert a page into JSON.
import html.parser


class HTMLtoJSONParser(html.parser.HTMLParser):
    """Convert an HTML document into a nested ``dict`` (JSON-like) structure.

    Each tag becomes a key; repeated sibling tags are merged into a list.
    Tag attributes are stored under ``"#<name>"`` keys and text content
    under the empty-string key ``""``.

    :param raise_exception: if True, raise when a closing tag does not match
        the most recently opened tag (typically caused by unclosed
        ``<br>``, ``<hr>``, ``<img>`` tags in malformed HTML)
    """

    def __init__(self, raise_exception=True):
        html.parser.HTMLParser.__init__(self)
        self.doc = {}           # root of the JSON-like structure
        self.path = []          # stack of currently open tag names
        self.cur = self.doc     # node currently being filled
        self.line = 0           # last known source line (kept for backward compatibility)
        self.raise_exception = raise_exception

    @property
    def json(self):
        """Parsed document as a nested dictionary."""
        return self.doc

    @staticmethod
    def to_json(content, raise_exception=True):
        """Parse *content* (an HTML string) and return the JSON-like dict."""
        parser = HTMLtoJSONParser(raise_exception=raise_exception)
        parser.feed(content)
        return parser.json

    def handle_starttag(self, tag, attrs):
        self.path.append(tag)
        attrs = {k: v for k, v in attrs}
        if tag in self.cur:
            if isinstance(self.cur[tag], list):
                self.cur[tag].append({"__parent__": self.cur})
                self.cur = self.cur[tag][-1]
            else:
                # second occurrence of this tag: promote the value to a list
                self.cur[tag] = [self.cur[tag]]
                self.cur[tag].append({"__parent__": self.cur})
                self.cur = self.cur[tag][-1]
        else:
            self.cur[tag] = {"__parent__": self.cur}
            self.cur = self.cur[tag]
        for a, v in attrs.items():
            self.cur["#" + a] = v
        self.cur[""] = ""

    def handle_endtag(self, tag):
        # BUGFIX: the previous implementation counted "\n" inside text nodes
        # only, so the reported line drifted away from the real position
        # (the notebook shows it reporting line 66 for a tag on line 42).
        # HTMLParser.getpos() returns the true (line, offset) of the token.
        self.line = self.getpos()[0]
        if tag != self.path[-1] and self.raise_exception:
            raise Exception("html is malformed around line: {0} (it might be because "
                            "of a tag <br>, <hr>, <img .. > not closed)".format(self.line))
        del self.path[-1]
        memo = self.cur
        self.cur = self.cur["__parent__"]
        self.clean(memo)

    def handle_data(self, data):
        self.line = self.getpos()[0]
        if "" in self.cur:
            self.cur[""] += data

    def clean(self, values):
        """Strip whitespace-only text entries and drop the ``__parent__`` back-link."""
        keys = list(values.keys())
        for k in keys:
            v = values[k]
            if isinstance(v, str):
                c = v.strip(" \n\r\t")
                if c != v:
                    if len(c) > 0:
                        values[k] = c
                    else:
                        # the text was only whitespace: remove it entirely
                        del values[k]
        del values["__parent__"]
The following Informations surfaciques du PLU (doc. du 10.09.2010) de la commune de Bannay contains some links we need to extract. We cache the page to avoid loading it again every time we run the script.
import os

# Download the page only once; later runs reuse the local cache file.
cache = "cache_content.html.bytes"
if os.path.exists(cache):
    # Cached copy is available: read the raw bytes back.
    with open(cache, "rb") as f:
        content = f.read()
else:
    # First run: fetch the page and store the raw bytes on disk.
    import urllib.request
    url = "https://www.data.gouv.fr/fr/datasets/informations-surfaciques-du-plu-doc-du-10-09-2010-de-la-commune-de-bannay/"
    with urllib.request.urlopen(url) as response:
        content = response.read()
    with open(cache, "wb") as f:
        f.write(content)
type(content)
bytes
We need to convert it into str. The encoding should be utf-8.
# Turn the raw bytes into text; the page is encoded in UTF-8.
page = str(content, "utf-8")
type(page)
str
We catch an error if there is any.
# Attempt the strict parse; malformed HTML raises, which we report.
error = False
try:
    js = HTMLtoJSONParser.to_json(page)
except Exception as exc:
    error = True
    print(exc)
html is malformed around line: 66 (it might be because of a tag <br>, <hr>, <img .. > not closed)
Let’s see:
if not error:
    print("No error.")
else:
    # Show a couple of lines around the spot the parser complained about.
    lines = page.split("\n")
    line = 42
    around = 2
    first = max(0, line - around)
    last = min(len(lines), line + around)
    for idx in range(first, last):
        print("%03d %s" % (idx, lines[idx]))
040 <meta name="check-urls" content="true" />
041 <meta name="check-urls-ignore" content="%5B%5D" />
042 <meta name="territory-enabled" content="true">
043
HTML is very often malformed and browsers are built to tolerate it. That's why there exist modules such as beautifulsoup.
With beautifulsoup#
It makes it very easy to extract all the urls.
from bs4 import BeautifulSoup

# beautifulsoup tolerates malformed HTML; collect every anchor tag.
soup = BeautifulSoup(content, 'html.parser')
url = [anchor for anchor in soup.find_all("a")]
url[:4]
[<a class="navbar-brand" href="/fr/">Data.gouv.fr</a>,
<a href="/fr/faq/">Découvrez l'OpenData</a>,
<a href="/fr/faq/citizen/">En tant que citoyen</a>,
<a href="/fr/faq/producer/">En tant que producteur</a>]
About JSON, because I don’t want to change my code too much, I use prettify before calling the code above.
# prettify() re-emits well-formed HTML, which the strict parser accepts.
pretty = soup.prettify()
with open("clean_content.html", "w", encoding="utf-8") as f:
    f.write(pretty)
js = HTMLtoJSONParser.to_json(pretty)
Now, I use javascript to go through it.
from jyquickhelper import JSONJS
# Render the JSON structure as an interactive tree inside the notebook.
JSONJS(js)