Parse HTML

views

Simple HTML and XHTML parser


Example HTML Parser Application

from html.parser import HTMLParser


class MyHTMLParser(HTMLParser):
    """Minimal HTMLParser subclass that echoes each parse event to stdout.

    Start tags, end tags and text data are each reported on one line,
    prefixed with a fixed-width label.
    """

    def _announce(self, label, payload):
        """Print *label* followed by *payload*, separated by one space."""
        print(label, payload)

    def handle_starttag(self, tag, attrs):
        """Report an opening tag; its attributes are ignored."""
        self._announce("Encountered a start tag:", tag)

    def handle_endtag(self, tag):
        """Report a closing tag."""
        self._announce("Encountered an end tag :", tag)

    def handle_data(self, data):
        """Report a run of text found between tags."""
        self._announce("Encountered some data  :", data)


# Build a parser and push one small, well-formed document through it.
# The two adjacent string literals are concatenated into a single argument.
parser = MyHTMLParser()
parser.feed(
    '<html><head><title>Test</title></head>'
    '<body><h1>Parse me!</h1></body></html>'
)


Encountered a start tag: html

Encountered a start tag: head

Encountered a start tag: title

Encountered some data  : Test

Encountered an end tag : title

Encountered an end tag : head

Encountered a start tag: body

Encountered a start tag: h1

Encountered some data  : Parse me!

Encountered an end tag : h1

Encountered an end tag : body

Encountered an end tag : html


HTMLParser Methods

from html.parser import HTMLParser

from html.entities import name2codepoint


class MyHTMLParser(HTMLParser):
    """HTMLParser subclass that prints every kind of parse event.

    Each handler writes one labelled line to stdout.  The entity and
    character-reference handlers are only invoked by HTMLParser when the
    instance is created with ``convert_charrefs=False``.
    """

    def handle_starttag(self, tag, attrs):
        """Print the tag name, then each ``(name, value)`` attribute pair."""
        print("Start tag:", tag)
        for attr in attrs:
            print("     attr:", attr)

    def handle_endtag(self, tag):
        """Print the name of a closing tag."""
        print("End tag  :", tag)

    def handle_data(self, data):
        """Print a run of text (or script/style content)."""
        print("Data     :", data)

    def handle_comment(self, data):
        """Print the text found between ``<!--`` and ``-->``."""
        print("Comment  :", data)

    def handle_entityref(self, name):
        """Print the character for a named reference such as ``&gt;``.

        Names missing from ``name2codepoint`` are echoed back verbatim as
        ``&name;`` instead of raising ``KeyError``.
        """
        try:
            c = chr(name2codepoint[name])
        except KeyError:
            # Unknown entity: keep the original reference text visible.
            c = "&" + name + ";"
        print("Named ent:", c)

    def handle_charref(self, name):
        """Print the character for a numeric reference (``&#62;``, ``&#x3E;``).

        *name* is the text between ``&#`` and ``;``.  A leading ``x`` or
        ``X`` marks a hexadecimal value.  Malformed or out-of-range values
        are shown as U+FFFD instead of raising.
        """
        try:
            # HTMLParser accepts both '&#x..;' and '&#X..;' as hex forms.
            if name.startswith(('x', 'X')):
                c = chr(int(name[1:], 16))
            else:
                c = chr(int(name))
        except (ValueError, OverflowError):
            # Not a valid number, or beyond the Unicode code point range.
            c = '\ufffd'
        print("Num ent  :", c)

    def handle_decl(self, data):
        """Print the body of a ``<!...>`` declaration such as a DOCTYPE."""
        print("Decl     :", data)


# convert_charrefs=False is needed for handle_entityref/handle_charref to be
# called at all; with the default (True) HTMLParser decodes the references
# itself and passes the resulting text to handle_data instead.
parser = MyHTMLParser(convert_charrefs=False)


>>> parser.feed('<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" '

...             '"http://www.w3.org/TR/html4/strict.dtd">')

Decl     : DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" "http://www.w3.org/TR/html4/strict.dtd"


>>> parser.feed('<img src="python-logo.png" alt="The Python logo">')

Start tag: img

     attr: ('src', 'python-logo.png')

     attr: ('alt', 'The Python logo')

>>>

>>> parser.feed('<h1>Python</h1>')

Start tag: h1

Data     : Python

End tag  : h1


>>> parser.feed('<style type="text/css">#python { color: green }</style>')

Start tag: style

     attr: ('type', 'text/css')

Data     : #python { color: green }

End tag  : style


>>> parser.feed('<script type="text/javascript">'

...             'alert("<strong>hello!</strong>");</script>')

Start tag: script

     attr: ('type', 'text/javascript')

Data     : alert("<strong>hello!</strong>");

End tag  : script


>>> parser.feed('<!-- a comment -->'

...             '<!--[if IE 9]>IE-specific content<![endif]-->')

Comment  :  a comment

Comment  : [if IE 9]>IE-specific content<![endif]


>>> # The entity handlers fire only if the parser was created with convert_charrefs=False.
>>> parser.feed('&gt;&#62;&#x3E;')

Named ent: >

Num ent  : >

Num ent  : >


>>> for chunk in ['<sp', 'an>buff', 'ered ', 'text</s', 'pan>']:

...     parser.feed(chunk)

...

Start tag: span

Data     : buff

Data     : ered

Data     : text

End tag  : span


>>> parser.feed('<p><a class=link href=#main>tag soup</p ></a>')

Start tag: p

Start tag: a

     attr: ('class', 'link')

     attr: ('href', '#main')

Data     : tag soup

End tag  : p

End tag  : a

Seedbacklink