I recently needed some code to quickly scrape the metadata from XHTML Web pages, so I kicked up the following code:
import amara
# The XHTML 1.x namespace URI, and the prefix mapping used by the XPath
# expressions below (u'xh' stands in for XHTML's default namespace).
XHTML1_NS = u'http://www.w3.org/1999/xhtml'
PREFIXES = { u'xh': XHTML1_NS }
def get_xhtml_metadata(source):
    """Scrape title, link and meta metadata from the head of an XHTML page.

    source -- URL, file path, or any other input amara.pushbind accepts.

    Returns a dict containing (when the corresponding elements exist):
      u'title' -- the text content of the <title> element
      u'links' -- a list of dicts, one per <link> element, mapping each
                  attribute name to its value
      plus one entry per <meta name="..."> element, keyed by the meta's
      name and holding its content.
      NOTE(review): node.name / node.content here rely on amara's bindery
      mapping XML attributes onto Python attributes -- confirm against the
      amara version in use.
    """
    md = {}
    for node in amara.pushbind(source, u'/xh:html/xh:head/*',
                               prefixes=PREFIXES):
        # The three branches test mutually exclusive element kinds, so a
        # single if/elif chain is used (the original ran the meta XPath
        # even on nodes that had already matched as <title>).
        if node.localName == u'title':
            md[u'title'] = unicode(node)
        elif node.localName == u'link':
            # Capture every attribute of the <link> element as-is.
            linkinfo = dict((attr.name, unicode(attr))
                            for attr in node.xml_xpath(u'@*'))
            md.setdefault(u'links', []).append(linkinfo)
        elif node.xml_xpath(u'self::xh:meta[@name]'):
            md[node.name] = unicode(node.content)
    return md
if __name__ == "__main__":
    # CLI entry point: the first command-line argument names the
    # document (URL or file) to scrape.
    import sys
    import pprint

    source = sys.argv[1]
    pprint.pprint(get_xhtml_metadata(source))

So, for example, scraping planet XML:
$ python xhtml-metadata.py http://planet.xmlhack.com/
{u'links': [{u'href': u'planet.css',
u'media': u'screen',
u'rel': u'stylesheet',
u'title': u'Default',
u'type': u'text/css'},
{u'href': u'/index.rdf',
u'rel': u'alternate',
u'title': u'RSS',
u'type': u'application/rss+xml'}],
 u'title': u'Planet XMLhack: Aggregated weblogs from XML hackers and commentators'}

via Copia