I recently needed some code to quickly scrape the metadata from XHTML Web pages, so I kicked up the following code:
import amara

XHTML1_NS = u'http://www.w3.org/1999/xhtml'
PREFIXES = { u'xh': XHTML1_NS }

def get_xhtml_metadata(source):
    md = {}
    # Stream over each child element of the XHTML head
    for node in amara.pushbind(source, u'/xh:html/xh:head/*', prefixes=PREFIXES):
        if node.localName == u'title':
            md[u'title'] = unicode(node)
        elif node.localName == u'link':
            # Capture all attributes of the link element (rel, href, type, ...)
            linkinfo = dict([ (attr.name, unicode(attr))
                              for attr in node.xml_xpath(u'@*') ])
            md.setdefault(u'links', []).append(linkinfo)
        elif node.xml_xpath(u'self::xh:meta[@name]'):
            # Named meta elements become name -> content entries
            md[node.name] = unicode(node.content)
    return md

if __name__ == "__main__":
    import sys, pprint
    source = sys.argv[1]
    pprint.pprint(get_xhtml_metadata(source))
So, for example, scraping Planet XMLhack:
$ python xhtml-metadata.py http://planet.xmlhack.com/
{u'links': [{u'href': u'planet.css',
             u'media': u'screen',
             u'rel': u'stylesheet',
             u'title': u'Default',
             u'type': u'text/css'},
            {u'href': u'/index.rdf',
             u'rel': u'alternate',
             u'title': u'RSS',
             u'type': u'application/rss+xml'}],
 u'title': u'Planet XMLhack: Aggregated weblogs from XML hackers and commentators'}
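If you don't have Amara handy, here's a rough stdlib-only sketch of the same idea using xml.etree.ElementTree (in the standard library as of Python 2.5). It's just an illustration, not the script above: get_xhtml_metadata_et is my made-up name, and it assumes the page really is well-formed XHTML, since ElementTree won't forgive tag soup the way a dedicated HTML parser would.

import urllib2
from xml.etree import ElementTree as ET

XHTML1_NS = u'http://www.w3.org/1999/xhtml'

def get_xhtml_metadata_et(source):
    # Parse the whole document up front (no streaming here)
    tree = ET.parse(urllib2.urlopen(source))
    md = {}
    head = tree.find(u'{%s}head' % XHTML1_NS)
    if head is None:
        return md
    for node in head:
        tag = node.tag.split(u'}')[-1]  # strip the namespace URI
        if tag == u'title':
            md[u'title'] = node.text
        elif tag == u'link':
            # All attributes of the link element (rel, href, type, ...)
            md.setdefault(u'links', []).append(dict(node.attrib))
        elif tag == u'meta' and node.get(u'name'):
            md[node.get(u'name')] = node.get(u'content')
    return md

Note the tradeoff: ElementTree builds the full tree before you see anything, whereas amara.pushbind is meant to yield matching nodes as the document streams by, which matters on large pages.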
via Copia