# pylint:disable-msg=E0611,I1101
"""
All functions needed to steer and execute downloads of web documents.
"""
import logging
import random
import warnings
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial
from io import BytesIO
from time import sleep
import certifi
import urllib3
try:
import pycurl
CURL_SHARE = pycurl.CurlShare()
# available options:
# https://curl.se/libcurl/c/curl_share_setopt.html
CURL_SHARE.setopt(pycurl.SH_SHARE, pycurl.LOCK_DATA_DNS)
CURL_SHARE.setopt(pycurl.SH_SHARE, pycurl.LOCK_DATA_SSL_SESSION)
# not thread-safe
# CURL_SHARE.setopt(pycurl.SH_SHARE, pycurl.LOCK_DATA_CONNECT)
except ImportError:
pycurl = None
from courlan import UrlStore
from courlan.network import redirection_test
try: # Python 3.8+
from importlib.metadata import version
except ImportError:
from importlib_metadata import version
from .settings import DEFAULT_CONFIG, Extractor
from .utils import URL_BLACKLIST_REGEX, decode_file, make_chunks
LOGGER = logging.getLogger(__name__)
NUM_CONNECTIONS = 50
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
HTTP_POOL = None
NO_CERT_POOL = None
RETRY_STRATEGY = None
DEFAULT_HEADERS = urllib3.util.make_headers(accept_encoding=True)
USER_AGENT = 'trafilatura/' + version("trafilatura") + ' (+https://github.com/adbar/trafilatura)'
DEFAULT_HEADERS['User-Agent'] = USER_AGENT
class Response:
"Store information gathered in a HTTP response object."
__slots__ = ["data", "headers", "html", "status", "url"]
def __init__(self, data, status, url):
self.data = data
self.headers = None
self.html = None
self.status = status
self.url = url
def __bool__(self):
return self.data is not None
def __repr__(self):
return self.html if self.html else decode_file(self.data)
def __str__(self):
return self.__repr__()
def store_headers(self, headerdict):
"Store response headers if required."
# further control steps here
self.headers = {k.lower(): v for k, v in headerdict.items()}
def decode_data(self, decode):
"Decode the bytestring in data and store a string in html."
if decode and self.data:
self.html = decode_file(self.data)
def as_dict(self):
"Convert the response object to a dictionary."
return {
attr: getattr(self, attr)
for attr in self.__slots__
if hasattr(self, attr)
}
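# Illustrative sketch (not part of the module): building a Response by hand and
# converting it to a plain dictionary, e.g. for serialization. The payload is a
# placeholder.
#
#     resp = Response(b"<html><body>test</body></html>", 200, "https://example.org")
#     resp.decode_data(True)
#     info = resp.as_dict()  # keys follow __slots__: data, headers, html, status, url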
# caching throws an error
# @lru_cache(maxsize=2)
def _parse_config(config):
'Read and extract HTTP header strings from the configuration file.'
# load a series of user-agents
myagents = config.get('DEFAULT', 'USER_AGENTS').strip() or None
if myagents is not None:
myagents = myagents.split("\n")
# https://developer.mozilla.org/en-US/docs/Web/HTTP/Cookies
# todo: support for several cookies?
mycookie = config.get('DEFAULT', 'COOKIE') or None
return myagents, mycookie
def _determine_headers(config, headers=None):
'Internal function to decide on user-agent string.'
if config != DEFAULT_CONFIG:
myagents, mycookie = _parse_config(config)
headers = {}
if myagents is not None:
rnumber = random.randint(0, len(myagents) - 1)
headers['User-Agent'] = myagents[rnumber]
if mycookie is not None:
headers['Cookie'] = mycookie
return headers or DEFAULT_HEADERS
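# Illustrative sketch (assumption: use_config() is available in trafilatura.settings
# and returns a fresh ConfigParser): feeding a custom configuration into
# _determine_headers() to rotate user agents and set a cookie.
#
#     from trafilatura.settings import use_config
#
#     my_config = use_config()
#     my_config.set("DEFAULT", "USER_AGENTS", "Firefox/115.0\nChrome/120.0")
#     my_config.set("DEFAULT", "COOKIE", "yummy_cookie=choco")
#     headers = _determine_headers(my_config)
#     # headers now holds a randomly picked 'User-Agent' and the 'Cookie' string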
def _send_urllib_request(url, no_ssl, with_headers, config):
"Internal function to robustly send a request (SSL or not) and return its result."
# customize headers
global HTTP_POOL, NO_CERT_POOL, RETRY_STRATEGY
if not RETRY_STRATEGY:
RETRY_STRATEGY = urllib3.util.Retry(
total=config.getint("DEFAULT", "MAX_REDIRECTS"),
redirect=config.getint("DEFAULT", "MAX_REDIRECTS"), # raise_on_redirect=False,
connect=0,
backoff_factor=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT')/2,
status_forcelist=[
429, 499, 500, 502, 503, 504, 509, 520, 521, 522, 523, 524, 525, 526, 527, 530, 598
],
# unofficial: https://en.wikipedia.org/wiki/List_of_HTTP_status_codes#Unofficial_codes
)
try:
# TODO: read by streaming chunks (stream=True, iter_content=xx)
# so we can stop downloading as soon as MAX_FILE_SIZE is reached
if no_ssl is False:
# define pool
if not HTTP_POOL:
HTTP_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), ca_certs=certifi.where(), num_pools=NUM_CONNECTIONS) # cert_reqs='CERT_REQUIRED'
# execute request
response = HTTP_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
else:
# define pool
if not NO_CERT_POOL:
NO_CERT_POOL = urllib3.PoolManager(retries=RETRY_STRATEGY, timeout=config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'), cert_reqs='CERT_NONE', num_pools=NUM_CONNECTIONS)
# execute request
response = NO_CERT_POOL.request('GET', url, headers=_determine_headers(config), retries=RETRY_STRATEGY)
except urllib3.exceptions.SSLError:
LOGGER.warning('retrying after SSLError: %s', url)
return _send_urllib_request(url, True, with_headers, config)
except Exception as err:
LOGGER.error('download error: %s %s', url, err) # sys.exc_info()[0]
else:
# necessary for standardization
resp = Response(response.data, response.status, response.geturl())
if with_headers:
resp.store_headers(response.headers)
return resp
# catchall
return None
def _handle_response(url, response, decode, options):
'Internal function to run safety checks on response result.'
lentest = len(response.html or response.data or "")
if response.status != 200:
LOGGER.error('not a 200 response: %s for URL %s', response.status, url)
elif lentest < options.min_file_size:
LOGGER.error('too small/incorrect for URL %s', url)
# raise error instead?
elif lentest > options.max_file_size:
LOGGER.error('too large: length %s for URL %s', lentest, url)
# raise error instead?
else:
return response.html if decode else response
# catchall
return None
def fetch_url(url, decode=True, no_ssl=False, config=DEFAULT_CONFIG, options=None):
"""Downloads a web page and seamlessly decodes the response.
Args:
url: URL of the page to fetch.
decode: Decode the response to a Unicode string (the default); passing False is deprecated, use fetch_response() for raw Response objects.
no_ssl: Don't try to establish a secure connection (to prevent SSLError).
config: Pass configuration values for output control.
options: Extraction options (supersedes config).
Returns:
Unicode string or None in case of failed downloads and invalid results.
"""
if not decode:
warnings.warn(
"""Raw response objects will be deprecated for fetch_url,
use fetch_response instead.""",
PendingDeprecationWarning
)
response = fetch_response(url, decode=decode, no_ssl=no_ssl, config=config)
if response is not None and response != '':
if not options:
options = Extractor(config=config)
return _handle_response(url, response, decode, options)
# return '' (useful to discard further processing?)
# return response
return None
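# Usage sketch for fetch_url() (illustrative, the URL is a placeholder):
#
#     downloaded = fetch_url("https://example.org")
#     if downloaded is not None:
#         print(downloaded[:100])  # decoded HTML as a Unicode string
#
# Downloads that are not HTTP 200 or whose length falls outside the configured
# MIN_FILE_SIZE/MAX_FILE_SIZE bounds are discarded and None is returned.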
def fetch_response(url, *, decode=False, no_ssl=False, with_headers=False, config=DEFAULT_CONFIG):
"""Downloads a web page and returns a full response object.
Args:
url: URL of the page to fetch.
decode: Use html attribute to decode the data (boolean).
no_ssl: Don't try to establish a secure connection (to prevent SSLError).
with_headers: Keep track of the response headers.
config: Pass configuration values for output control.
Returns:
Response object or None in case of failed downloads and invalid results.
"""
dl_function = _send_urllib_request if pycurl is None else _send_pycurl_request
LOGGER.debug('sending request: %s', url)
response = dl_function(url, no_ssl, with_headers, config) # Response
if not response: # None or ""
LOGGER.debug('request failed: %s', url)
return None
response.decode_data(decode)
return response
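# Usage sketch for fetch_response() (illustrative): inspecting status, final URL
# after redirects, headers and body of the raw response.
#
#     resp = fetch_response("https://example.org", decode=True, with_headers=True)
#     if resp:
#         print(resp.status, resp.url)
#         print(resp.headers.get("content-type"))  # header names are lowercased
#         print(resp.html[:100] if resp.html else resp.data[:100])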
def _pycurl_is_live_page(url):
"Send a basic HTTP HEAD request with pycurl."
# Initialize pycurl object
curl = pycurl.Curl()
# Set the URL and HTTP method (HEAD)
curl.setopt(pycurl.URL, url.encode('utf-8'))
curl.setopt(pycurl.CONNECTTIMEOUT, 10)
# no SSL verification
curl.setopt(pycurl.SSL_VERIFYPEER, 0)
curl.setopt(pycurl.SSL_VERIFYHOST, 0)
# Set option to avoid getting the response body
curl.setopt(curl.NOBODY, True)
# Perform the request
try:
curl.perform()
except pycurl.error as err:
LOGGER.debug('pycurl HEAD error: %s %s', url, err)
return False
# Get the response code
page_exists = curl.getinfo(curl.RESPONSE_CODE) < 400
# Clean up
curl.close()
return page_exists
def _urllib3_is_live_page(url):
"Use courlan redirection test (based on urllib3) to send a HEAD request."
try:
_ = redirection_test(url)
except Exception as err:
LOGGER.debug('urllib3 HEAD error: %s %s', url, err)
return False
return True
def is_live_page(url):
"Send a HTTP HEAD request without taking anything else into account."
if pycurl is not None:
return _pycurl_is_live_page(url) or _urllib3_is_live_page(url)
return _urllib3_is_live_page(url)
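# Usage sketch for is_live_page() (illustrative, placeholder URL): a cheap
# reachability test with a HEAD request before scheduling a full download.
#
#     if is_live_page("https://example.org"):
#         html = fetch_url("https://example.org")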
def add_to_compressed_dict(inputlist, blacklist=None, url_filter=None, url_store=None, compression=False, verbose=False):
'''Filter and convert input URLs, then add them to the domain-aware processing dictionary.'''
if url_store is None:
url_store = UrlStore(
compressed=compression,
strict=False,
verbose=verbose
)
inputlist = list(dict.fromkeys(inputlist))
if blacklist:
inputlist = [u for u in inputlist if URL_BLACKLIST_REGEX.sub('', u) not in blacklist]
if url_filter:
inputlist = [u for u in inputlist if any(f in u for f in url_filter)]
url_store.add_urls(inputlist)
return url_store
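# Usage sketch for add_to_compressed_dict() (illustrative, placeholder URLs):
# deduplicate a list of links and store them grouped by host in a UrlStore.
#
#     links = [
#         "https://example.org/page1",
#         "https://example.org/page2",
#         "https://example.net/page1",
#     ]
#     url_store = add_to_compressed_dict(links)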
def load_download_buffer(url_store, sleep_time=5):
'''Determine threading strategy and draw URLs respecting domain-based back-off rules.'''
bufferlist = []
while not bufferlist:
bufferlist = url_store.get_download_urls(time_limit=sleep_time, max_urls=10**5)
# add emptiness test or sleep?
if not bufferlist:
if url_store.done is True:
break
sleep(sleep_time)
return bufferlist, url_store
def buffered_downloads(bufferlist, download_threads, decode=True, options=None):
'''Download queue consumer, single- or multi-threaded.'''
worker = partial(fetch_url, decode=decode, options=options)
with ThreadPoolExecutor(max_workers=download_threads) as executor:
for chunk in make_chunks(bufferlist, 10000):
future_to_url = {executor.submit(worker, url): url for url in chunk}
for future in as_completed(future_to_url):
# url and download result
yield future_to_url[future], future.result()
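# Usage sketch combining the helpers above (illustrative, placeholder URLs and
# thread count): fill a UrlStore, then drain it in politely throttled batches.
#
#     url_store = add_to_compressed_dict(["https://example.org/1", "https://example.org/2"])
#     while not url_store.done:
#         bufferlist, url_store = load_download_buffer(url_store, sleep_time=5)
#         for url, result in buffered_downloads(bufferlist, download_threads=4):
#             if result is not None:
#                 ...  # process the downloaded page here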
def _send_pycurl_request(url, no_ssl, with_headers, config):
'''Experimental function using libcurl and pycurl to speed up downloads'''
# https://github.com/pycurl/pycurl/blob/master/examples/retriever-multi.py
# init
headerbytes = BytesIO()
headers = _determine_headers(config)
headerlist = ['Accept-Encoding: gzip, deflate', 'Accept: */*']
for header, content in headers.items():
headerlist.append(header + ': ' + content)
# prepare curl request
# https://curl.haxx.se/libcurl/c/curl_easy_setopt.html
curl = pycurl.Curl()
curl.setopt(pycurl.URL, url.encode('utf-8'))
# share data
curl.setopt(pycurl.SHARE, CURL_SHARE)
curl.setopt(pycurl.HTTPHEADER, headerlist)
# curl.setopt(pycurl.USERAGENT, '')
curl.setopt(pycurl.FOLLOWLOCATION, 1)
curl.setopt(pycurl.MAXREDIRS, config.getint('DEFAULT', 'MAX_REDIRECTS'))
curl.setopt(pycurl.CONNECTTIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
curl.setopt(pycurl.TIMEOUT, config.getint('DEFAULT', 'DOWNLOAD_TIMEOUT'))
curl.setopt(pycurl.MAXFILESIZE, config.getint('DEFAULT', 'MAX_FILE_SIZE'))
curl.setopt(pycurl.NOSIGNAL, 1)
if no_ssl is True:
curl.setopt(pycurl.SSL_VERIFYPEER, 0)
curl.setopt(pycurl.SSL_VERIFYHOST, 0)
else:
curl.setopt(pycurl.CAINFO, certifi.where())
if with_headers:
curl.setopt(pycurl.HEADERFUNCTION, headerbytes.write)
# TCP_FASTOPEN
# curl.setopt(pycurl.FAILONERROR, 1)
# curl.setopt(pycurl.ACCEPT_ENCODING, '')
# send request
try:
bufferbytes = curl.perform_rb()
except pycurl.error as err:
LOGGER.error('pycurl error: %s %s', url, err)
# retry in case of SSL-related error
# see https://curl.se/libcurl/c/libcurl-errors.html
# errmsg = curl.errstr_raw()
# additional error codes: 80, 90, 96, 98
if no_ssl is False and err.args[0] in (35, 54, 58, 59, 60, 64, 66, 77, 82, 83, 91):
LOGGER.debug('retrying after SSL error: %s %s', url, err)
return _send_pycurl_request(url, True, with_headers, config)
# traceback.print_exc(file=sys.stderr)
# sys.stderr.flush()
return None
# additional info
# ip_info = curl.getinfo(curl.PRIMARY_IP)
resp = Response(bufferbytes, curl.getinfo(curl.RESPONSE_CODE), curl.getinfo(curl.EFFECTIVE_URL))
curl.close()
if with_headers:
respheaders = {}
# https://github.com/pycurl/pycurl/blob/master/examples/quickstart/response_headers.py
for line in headerbytes.getvalue().decode("iso-8859-1", errors="replace").splitlines():
# re.split(r'\r?\n') ?
# This will botch headers that are split on multiple lines...
if ':' not in line:
continue
# Break the header line into header name and value.
name, value = line.split(':', 1)
# Now we can actually record the header name and value.
respheaders[name.strip()] = value.strip() # name.strip().lower() ?
resp.store_headers(respheaders)
return resp