I recently needed some code to quickly scrape the metadata from XHTML Web pages, so I kicked up the following code:
import amara

XHTML1_NS = u'http://www.w3.org/1999/xhtml'
PREFIXES = { u'xh': XHTML1_NS }

def get_xhtml_metadata(source):
    md = {}
    # Stream over each child element of the XHTML head
    for node in amara.pushbind(source, u'/xh:html/xh:head/*', prefixes=PREFIXES):
        if node.localName == u'title':
            md[u'title'] = unicode(node)
        elif node.localName == u'link':
            # Capture all attributes of the link element (rel, href, type, ...)
            linkinfo = dict([ (attr.name, unicode(attr))
                              for attr in node.xml_xpath(u'@*') ])
            md.setdefault(u'links', []).append(linkinfo)
        elif node.xml_xpath(u'self::xh:meta[@name]'):
            # Named meta elements become name -> content entries
            md[node.name] = unicode(node.content)
    return md

if __name__ == "__main__":
    import sys, pprint
    source = sys.argv[1]
    pprint.pprint(get_xhtml_metadata(source))
So, for example, scraping Planet XMLhack:
$ python xhtml-metadata.py http://planet.xmlhack.com/
{u'links': [{u'href': u'planet.css',
             u'media': u'screen',
             u'rel': u'stylesheet',
             u'title': u'Default',
             u'type': u'text/css'},
            {u'href': u'/index.rdf',
             u'rel': u'alternate',
             u'title': u'RSS',
             u'type': u'application/rss+xml'}],
 u'title': u'Planet XMLhack: Aggregated weblogs from XML hackers and commentators'}
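If you don't have Amara handy, here's a rough stdlib-only sketch of the same idea using xml.etree.ElementTree (in the standard library as of Python 2.5). It's just an illustration, not the script above: get_xhtml_metadata_et is my made-up name, and it assumes the page really is well-formed XHTML, since ElementTree won't forgive tag soup the way a dedicated HTML parser would.

import urllib2
from xml.etree import ElementTree as ET

XHTML1_NS = u'http://www.w3.org/1999/xhtml'

def get_xhtml_metadata_et(source):
    # Parse the whole document up front (no streaming here)
    tree = ET.parse(urllib2.urlopen(source))
    md = {}
    head = tree.find(u'{%s}head' % XHTML1_NS)
    if head is None:
        return md
    for node in head:
        tag = node.tag.split(u'}')[-1]  # strip the namespace URI
        if tag == u'title':
            md[u'title'] = node.text
        elif tag == u'link':
            # All attributes of the link element (rel, href, type, ...)
            md.setdefault(u'links', []).append(dict(node.attrib))
        elif tag == u'meta' and node.get(u'name'):
            md[node.get(u'name')] = node.get(u'content')
    return md

Note the tradeoff: ElementTree builds the full tree before you see anything, whereas amara.pushbind is meant to yield matching nodes as the document streams by, which matters on large pages.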
via Copia