Entry
Extracting data from HTML documents
Jul 5th, 2000 10:00
Nathan Wallace, unknown unknown, Hans Nowak, Snippet 114, Glyn Webster
"""
Packages: text.html
"""
""" Rough functions for extracting data from HTML documents.
Glyn Webster <glyn@ninz.org.nz> 2021-04-27
"""
import re
# All patterns ignore tag case and let "." span newlines, since tags and
# their contents may be written in any case and broken across lines.
_mode = re.IGNORECASE | re.DOTALL

# Greedy capture: the group runs from the first <BODY ...> tag to the last
# </BODY> in the document.
_body = re.compile(r'<BODY.*?>(.*)</BODY>', _mode)

# Non-greedy capture so the match stops at the first </TITLE>; a greedy
# ".*" would swallow everything up to the last </TITLE> in the document
# (for example a <title> inside inline SVG).  "\b[^>]*" tolerates
# attributes on the opening tag.
_title = re.compile(r'<TITLE\b[^>]*>(.*?)</TITLE>', _mode)

# '[^"]*' keeps each value inside its own pair of quotes.  With '.*?' and
# DOTALL, a <META NAME="..."> tag that is not directly followed by CONTENT
# made the NAME group run on into a later tag.
_meta = re.compile(r'<META\s+NAME="([^"]*)"\s+CONTENT="([^"]*)"', _mode)
def body(html):
    """Return the body of an HTML document.

    The result is the text that the module-level ``_body`` pattern
    captures between the opening ``<BODY ...>`` tag and the closing
    ``</BODY>`` tag.  When the pattern does not match, ``html`` itself
    is returned unchanged.
    """
    found = _body.search(html)
    # If there are no <BODY> tags then the whole thing is the body.
    return found.group(1) if found else html
def title(html):
    """Return the title of an HTML document.

    The result is the text that the module-level ``_title`` pattern
    captures between the title tags.  An empty string is returned when
    the pattern does not match.
    """
    found = _title.search(html)
    if not found:
        return ""
    return found.group(1)
def meta(html):
    """Return a dictionary of <META> tag data that maps NAME onto CONTENT.

    Every match of the module-level ``_meta`` pattern in ``html``
    contributes one entry.  If the same NAME occurs more than once, the
    CONTENT of the last occurrence wins.  An empty dictionary is returned
    when nothing matches.
    """
    # finditer() yields the successive non-overlapping matches, so there is
    # no need to track the search position by hand.
    return {m.group(1): m.group(2) for m in _meta.finditer(html)}