Cheat Sheet on HTML#

Links: notebook, html, PDF, python, slides, GitHub

Parse HTML and extract information.

# NOTE(review): add_notebook_menu presumably renders a navigation menu
# (table of contents) for this notebook -- confirm against jyquickhelper.
from jyquickhelper import add_notebook_menu
add_notebook_menu()

Parse with Python and convert it into JSON#

Inspired by Convert HTML into JSON. We convert a page into JSON.

import html.parser

class HTMLtoJSONParser(html.parser.HTMLParser):
    """Convert an HTML document into nested dictionaries (JSON-like).

    Every element becomes a dictionary stored in its parent under the tag
    name; when the same tag appears several times under one parent, the
    entry becomes a list of dictionaries. Inside an element's dictionary:

    * ``"#" + name`` holds the value of attribute *name*,
    * ``""`` holds the concatenated text found directly in the element,
    * any other key is a child tag.

    While an element is open, its dictionary also carries a temporary
    ``"__parent__"`` back-reference, removed when the element is closed.

    :param raise_exception: if True (default), an end tag which does not
        match the innermost open element raises an exception; if False,
        the parser recovers (see :meth:`handle_endtag`).
    """

    def __init__(self, raise_exception=True):
        super().__init__()
        self.doc = {}            # root of the converted document
        self.path = []           # tags of the currently open elements
        self.cur = self.doc      # dictionary of the innermost open element
        self.line = 0            # 0-based line of the last parsed construct
        self.raise_exception = raise_exception

    @property
    def json(self):
        """Return the root dictionary built so far."""
        return self.doc

    @staticmethod
    def to_json(content, raise_exception=True):
        """Parse the string *content* and return the resulting dictionary.

        :param content: HTML text (``str``)
        :param raise_exception: see the class documentation
        :return: nested dictionaries and lists describing the document
        :raises ValueError: on a mismatched end tag if *raise_exception*
        """
        parser = HTMLtoJSONParser(raise_exception=raise_exception)
        parser.feed(content)
        # close() flushes pending input and closes the elements still open,
        # so that no "__parent__" back-reference survives in the result.
        parser.close()
        return parser.json

    def close(self):
        """Finish parsing and close every element which is still open."""
        super().close()
        while self.path:
            self._close_current()

    def _sync_line(self):
        """Store in ``self.line`` the 0-based line of the current construct."""
        # getpos() is 1-based; the historical counter started at 0.
        self.line = self.getpos()[0] - 1

    def _close_current(self):
        """Close the innermost open element and make its parent current."""
        del self.path[-1]
        node = self.cur
        self.cur = node["__parent__"]
        self.clean(node)

    def handle_starttag(self, tag, attrs):
        """Open a new element *tag* below the current one."""
        self._sync_line()
        self.path.append(tag)
        node = {"__parent__": self.cur}
        if tag in self.cur:
            siblings = self.cur[tag]
            if isinstance(siblings, list):
                siblings.append(node)
            else:
                # second occurrence of the tag: switch to a list
                self.cur[tag] = [siblings, node]
        else:
            self.cur[tag] = node
        self.cur = node

        # a repeated attribute keeps its last value
        for name, value in attrs:
            node["#" + name] = value
        node[""] = ""

    def handle_endtag(self, tag):
        """Close element *tag*.

        If *tag* is not the innermost open element, the document is
        malformed: an exception is raised when ``raise_exception`` is True.
        Otherwise, a stray end tag (no open element with that name) is
        ignored, and an end tag matching an outer element implicitly closes
        the elements opened inside it.

        :raises ValueError: mismatched end tag and ``raise_exception`` set
        """
        self._sync_line()
        if not self.path or tag != self.path[-1]:
            if self.raise_exception:
                raise ValueError("html is malformed around line: {0} (it might be because "
                                 "of a tag <br>, <hr>, <img .. > not closed)".format(self.line))
            if tag not in self.path:
                return
            while self.path[-1] != tag:
                self._close_current()
        self._close_current()

    def handle_data(self, data):
        """Append text to the current element (text outside any element is dropped)."""
        self._sync_line()
        if "" in self.cur:
            self.cur[""] += data

    def clean(self, values):
        """Tidy the dictionary of an element which has just been closed.

        String values with leading or trailing whitespace are stripped and
        removed if nothing is left; other strings (including empty ones)
        are kept unchanged. The ``"__parent__"`` back-reference is removed.
        """
        for key in list(values):
            value = values[key]
            if not isinstance(value, str):
                continue
            stripped = value.strip(" \n\r\t")
            if stripped == value:
                continue
            if stripped:
                values[key] = stripped
            else:
                del values[key]
        del values["__parent__"]

The following page, Informations surfaciques du PLU (doc. du 10.09.2010) de la commune de Bannay, contains some links we need to extract. We cache the page to avoid loading it again every time we run the script.

import os

# The raw bytes of the page are kept on disk so that the web site is only
# contacted the first time the script runs.
cache = "cache_content.html.bytes"
if os.path.exists(cache):
    # Cache hit: reuse the bytes saved by a previous run.
    with open(cache, "rb") as cached:
        content = cached.read()
else:
    # Cache miss: download the page, then store it for the next run.
    import urllib.request
    url = "https://www.data.gouv.fr/fr/datasets/informations-surfaciques-du-plu-doc-du-10-09-2010-de-la-commune-de-bannay/"
    with urllib.request.urlopen(url) as reply:
        content = reply.read()
    with open(cache, "wb") as cached:
        cached.write(content)
type(content)
bytes

We need to convert it into str. The encoding should be utf-8.

# Turn the downloaded bytes into text, decoding them as UTF-8.
page = str(content, encoding="utf-8")
type(page)
str

We catch an error if there is any.

# Attempt the conversion; `error` records whether the parser rejected the page.
try:
    js = HTMLtoJSONParser.to_json(page)
except Exception as exc:
    print(exc)
    error = True
else:
    error = False
html is malformed around line: 66 (it might be because of a tag <br>, <hr>, <img .. > not closed)

Let’s see:

# When parsing failed, print a few numbered lines of the page around `line`.
if not error:
    print("No error.")
else:
    lines = page.split("\n")
    line = 42
    around = 2
    # Clamp the window to the bounds of the document.
    begin = max(0, line - around)
    end = min(len(lines), line + around)
    for i, text in enumerate(lines[begin:end], start=begin):
        print(f"{i:03d} {text}")
040 <meta name="check-urls" content="true" />
041 <meta name="check-urls-ignore" content="%5B%5D" />
042 <meta name="territory-enabled" content="true">
043

HTML is very often malformed and browsers are used to it. That’s why modules such as beautifulsoup exist.

With beautifulsoup#

Very easy to extract all urls.

from bs4 import BeautifulSoup

# Parse the downloaded bytes with the standard-library backend, then
# collect every <a> element of the page.
soup = BeautifulSoup(markup=content, features="html.parser")
anchors = soup.find_all("a")
url = list(anchors)
url[:4]
[<a class="navbar-brand" href="/fr/">Data.gouv.fr</a>,
 <a href="/fr/faq/">Découvrez l'OpenData</a>,
 <a href="/fr/faq/citizen/">En tant que citoyen</a>,
 <a href="/fr/faq/producer/">En tant que producteur</a>]

About JSON, because I don’t want to change my code too much, I use prettify before calling the code above.

# Serialise the repaired document once: prettify() walks the whole tree,
# so the result is reused for both the file and the JSON conversion.
pretty_page = soup.prettify()
with open("clean_content.html", "w", encoding="utf-8") as f:
    f.write(pretty_page)
js = HTMLtoJSONParser.to_json(pretty_page)

Now, I use javascript to go through it.

# NOTE(review): JSONJS presumably renders the nested dictionary as an
# interactive JavaScript view in the notebook output -- confirm against
# jyquickhelper.
from jyquickhelper import JSONJS
JSONJS(js)