Source code for trafilatura.baseline

# pylint:disable-msg=E0611
import json
import re

from lxml.etree import Element, SubElement

from .settings import BASIC_CLEAN_XPATH
from .utils import load_html, trim


# Capture the value of an "articlebody" key in raw JSON text (case-insensitive):
# group 1 extends lazily up to the first double quote not preceded by a backslash.
JSON_SEARCH = re.compile(r'"articlebody": *"(.+?)(?<!\\)"', re.I)



def basic_cleaning(tree):
    """Remove a few section types from the document.

    Every element selected by ``BASIC_CLEAN_XPATH`` is detached from its
    parent, so the tree is modified in place.

    Args:
        tree: LXML element to clean.

    Returns:
        The same tree object, after removal of the selected elements.

    """
    for elem in BASIC_CLEAN_XPATH(tree):
        parent = elem.getparent()
        # A match without a parent (e.g. the root element) cannot be detached:
        # skip it rather than fail with an AttributeError on None.
        if parent is not None:
            parent.remove(elem)
    return tree


def _unescape_json_string(raw):
    """Decode the escape sequences of a raw JSON string body (no outer quotes)."""
    try:
        # strict=False tolerates literal control characters (e.g. raw newlines)
        return json.loads('"' + raw + '"', strict=False)
    except ValueError:
        # not a well-formed JSON string: only undo escaped double quotes
        return raw.replace('\\"', '"')


def baseline(filecontent):
    """Use baseline extraction function targeting text paragraphs and/or JSON metadata.

    Strategies are tried in order and the first one yielding a result wins:
    JSON-LD "articleBody" metadata, the first <article> element, text-bearing
    elements (paragraphs, quotes, code), the whole <body>, and finally
    html2txt(). All but the first and the last require more than
    100 characters of text to be accepted.

    Args:
        filecontent: HTML code as binary string or string.

    Returns:
        A LXML <body> element containing the extracted paragraphs,
        the main text as string, and its length as integer.

    """
    tree = load_html(filecontent)
    postbody = Element('body')
    if tree is None:
        return postbody, '', 0

    # scrape from json text
    for script in tree.iterfind('.//script[@type="application/ld+json"]'):
        if script.text and '"article' in script.text:
            mymatch = JSON_SEARCH.search(script.text)
            if mymatch:
                raw = mymatch[1]
                paragraph = SubElement(postbody, 'p')
                try:
                    # decode all JSON escapes (\n, \uXXXX, \\, ...), not only \"
                    paragraph.text = trim(_unescape_json_string(raw))
                except ValueError:
                    # lxml rejects text that is not XML-compatible (e.g. decoded
                    # control characters): keep the plain quote unescaping
                    paragraph.text = trim(raw.replace('\\"', '"'))
                return postbody, paragraph.text, len(paragraph.text)

    tree = basic_cleaning(tree)

    # scrape from article tag
    article_elem = tree.find('.//article')
    if article_elem is not None:
        temp_text = trim(article_elem.text_content())
        if len(temp_text) > 100:
            SubElement(postbody, 'p').text = temp_text
            return postbody, temp_text, len(temp_text)

    # scrape from text paragraphs, skipping exact duplicates
    results = set()
    for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
        entry = element.text_content()
        if entry not in results:
            SubElement(postbody, 'p').text = entry
            results.add(entry)
    temp_text = trim('\n'.join(postbody.itertext()))
    if len(temp_text) > 100:
        return postbody, temp_text, len(temp_text)

    # default strategy: clean the tree and take everything
    # (start from a fresh container to discard the paragraphs collected above)
    postbody = Element('body')
    body_elem = tree.find('.//body')
    if body_elem is not None:
        text = '\n'.join([trim(e) for e in body_elem.itertext()])
        if len(text) > 100:
            SubElement(postbody, 'p').text = text
            return postbody, text, len(text)

    # last resort: return whatever html2txt() yields, even if short or empty
    # (an earlier version returned an empty result at this point)
    text = html2txt(tree)
    SubElement(postbody, 'p').text = text
    return postbody, text, len(text)


def html2txt(content):
    """Run basic html2txt on a document.

    The document is loaded, its <body> is located, the tree is passed
    through basic_cleaning() and the remaining text of the body is
    returned with all whitespace runs collapsed to single spaces.

    Args:
        content: HTML document as string or LXML element.

    Returns:
        The extracted text in the form of a string or an empty string.

    """
    document = load_html(content)
    body_node = None if document is None else document.find(".//body")
    if body_node is None:
        return ""
    # cleaning happens in place, so body_node reflects the removals
    basic_cleaning(document)
    words = body_node.text_content().split()
    return " ".join(words)