import libxml2,re,os,string # convert the HTML to XHTML (if necessary) os.system("tidy -q -asxhtml < test-page.html > /tmp/test-page.xhtml 2> /dev/null") # parse the XML doc = libxml2.parseFile('/tmp/test-page.xhtml') # search all nodes having a class of ocr_line lines = doc.xpathEval("//*[@class='ocr_line']") # a function for extracting the text from a node def get_text(node): textnodes = node.xpathEval(".//text()") s = string.join([node.getContent() for node in textnodes]) return re.sub(r'\s+',' ',s) # a function for extracting the bbox property from a node # note that the title= attribute on a node with an ocr_ class must # conform with the OCR spec def get_bbox(node): data = node.prop('title') bboxre = re.compile(r'\bbbox\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)') return [int(x) for x in bboxre.search(data).groups()] # now, extract the for line in lines: print get_bbox(line),get_text(line)