Source code for trafilatura.baseline
# pylint:disable-msg=E0611
import re
from lxml.etree import Element, SubElement
from .settings import BASIC_CLEAN_XPATH
from .utils import load_html, trim
# Captures the string value of an "articleBody" key in raw JSON text.
# The key is matched case-insensitively; the negative lookbehind keeps
# backslash-escaped quotes inside the captured value instead of ending it.
JSON_SEARCH = re.compile(
    r'"articlebody": *"(.+?)(?<!\\)"',
    flags=re.IGNORECASE,
)
def basic_cleaning(tree):
    """Remove a few section types from the document.

    Every element selected by ``BASIC_CLEAN_XPATH`` is detached from its
    parent. The tree is modified in place.

    Args:
        tree: LXML tree (or element) to clean.

    Returns:
        The same tree object that was passed in.
    """
    for elem in BASIC_CLEAN_XPATH(tree):
        parent = elem.getparent()
        if parent is None:
            # an element without a parent (e.g. the root) cannot be detached
            continue
        if elem.tail:
            # lxml's remove() drops the tail text together with the element,
            # which would lose document text that merely follows the removed
            # section: re-attach it to the preceding sibling or to the parent
            previous = elem.getprevious()
            if previous is None:
                parent.text = (parent.text or '') + elem.tail
            else:
                previous.tail = (previous.tail or '') + elem.tail
        parent.remove(elem)
    return tree
[docs]
def baseline(filecontent):
    """Use baseline extraction function targeting text paragraphs and/or JSON metadata.

    Strategies are tried in order and the first one yielding a usable
    result is returned: JSON-LD ``articleBody`` metadata, the first
    ``<article>`` element, text-bearing elements (paragraphs, quotes,
    code), the whole ``<body>``, and finally ``html2txt``.

    Args:
        filecontent: HTML code as binary string or string.

    Returns:
        A LXML <body> element containing the extracted paragraphs,
        the main text as string, and its length as integer.
    """
    tree = load_html(filecontent)
    postbody = Element('body')
    if tree is None:
        return postbody, '', 0

    # scrape from json text
    for script in tree.iterfind('.//script[@type="application/ld+json"]'):
        if script.text and '"article' in script.text:
            mymatch = JSON_SEARCH.search(script.text)
            if mymatch:
                json_text = trim(mymatch[1].replace('\\"', '"'))
                # an empty result must not short-circuit the other strategies
                if json_text:
                    SubElement(postbody, 'p').text = json_text
                    return postbody, json_text, len(json_text)

    tree = basic_cleaning(tree)

    # scrape from article tag
    article_elem = tree.find('.//article')
    if article_elem is not None:
        temp_text = trim(article_elem.text_content())
        if len(temp_text) > 100:
            SubElement(postbody, 'p').text = temp_text
            return postbody, temp_text, len(temp_text)

    # scrape from text paragraphs
    results = set()
    for element in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
        entry = element.text_content()
        # skip blank elements (they would only add empty <p> nodes)
        # and exact duplicates of text already collected
        if not entry.strip() or entry in results:
            continue
        results.add(entry)
        SubElement(postbody, 'p').text = entry
    temp_text = trim('\n'.join(postbody.itertext()))
    if len(temp_text) > 100:
        return postbody, temp_text, len(temp_text)

    # default strategy: clean the tree and take everything
    # (start over with a fresh body, discarding the paragraphs gathered above)
    postbody = Element('body')
    body_elem = tree.find('.//body')
    if body_elem is not None:
        fragments = (trim(chunk) for chunk in body_elem.itertext())
        # drop empty fragments so that newline separators between
        # whitespace-only text nodes do not count towards the length threshold
        text = '\n'.join(fragment for fragment in fragments if fragment)
        if len(text) > 100:
            SubElement(postbody, 'p').text = text
            return postbody, text, len(text)

    # new fallback: plain text conversion of whatever is left
    text = html2txt(tree)
    SubElement(postbody, 'p').text = text
    return postbody, text, len(text)
[docs]
def html2txt(content):
    """Convert a document to plain text with a basic html2txt approach.

    Args:
        content: HTML document as string or LXML element.

    Returns:
        The text found in the document body as a single string with words
        separated by single spaces, or an empty string if the document
        cannot be loaded or has no body.
    """
    document = load_html(content)
    if document is None:
        return ""
    body_node = document.find(".//body")
    if body_node is None:
        return ""
    # the body is looked up first, then the whole document is cleaned
    basic_cleaning(document)
    words = body_node.text_content().split()
    # joining the split words already leaves no leading or trailing whitespace
    return " ".join(words)