FAQTs - Knowledge Base - View Entry - Extracting data from HTML documents

faqts : Computers : Programming : Languages : Python : Snippets : Web Programming / Manipulating HTML files

+ Search

Entry

Extracting data from HTML documents

Jul 5th, 2000 10:00
Nathan Wallace, unknown unknown, Hans Nowak, Snippet 114, Glyn Webster

"""
Packages: text.html
"""
""" Rough functions for extracting data from HTML documents. 
    Glyn Webster <glyn@ninz.org.nz> 2021-04-27
"""
import re
# All patterns ignore case (HTML tag names are case-insensitive) and use
# DOTALL so that elements spanning several lines still match.
_mode = re.IGNORECASE | re.DOTALL
# Group 1 is everything between the first <BODY ...> tag and the LAST
# </BODY>.  The capture is greedy so that a stray "</body>" inside the
# document does not truncate the result.
_body  = re.compile(r'<BODY\b[^>]*>(.*)</BODY\s*>', _mode)
# Group 1 is the text of the first <TITLE> element.  The capture is
# non-greedy so that a later </TITLE> (for instance an SVG <title> inside
# the body) cannot stretch the match across the rest of the document.
_title = re.compile(r'<TITLE\b[^>]*>(.*?)</TITLE\s*>', _mode)
# Group 1 is NAME, group 2 is CONTENT (double-quoted, NAME first).  [^"]*
# keeps each capture inside its own pair of quotes, so a <META NAME="...">
# tag without CONTENT is skipped instead of being glued onto the next tag.
_meta  = re.compile(r'<META\s+NAME="([^"]*)"\s+CONTENT="([^"]*)"', _mode)
def body(html):
    """Return the body of an HTML document.

    html -- the document source as a string.

    The result is group 1 of the module-level `_body` pattern, i.e. the
    text found between the opening <BODY ...> tag and the closing </BODY>.
    When the pattern does not match, the input is returned unchanged.
    """
    found = _body.search(html)
    if found is None:
        # Without <BODY> tags the whole document counts as the body.
        return html
    return found.group(1)
def title(html):
    """Return the title of an HTML document, or "" when it has none.

    html -- the document source as a string.

    The result is group 1 of the first match of the module-level `_title`
    pattern, returned exactly as captured (no whitespace is stripped).
    """
    found = _title.search(html)
    return found.group(1) if found is not None else ""
def meta(html):
    """Return a dictionary of <META> tag data that maps NAME onto CONTENT.

    html -- the document source as a string.

    Every non-overlapping match of the module-level `_meta` pattern adds
    one entry (group 1 -> group 2).  If the same NAME occurs more than
    once, the last occurrence wins.  An empty dictionary is returned when
    nothing matches.
    """
    # finditer() performs the same left-to-right, non-overlapping scan as
    # calling search() repeatedly from the end of the previous match.
    return {m.group(1): m.group(2) for m in _meta.finditer(html)}