diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 19:31:14 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-08 19:35:15 -0800 |
commit | ecd36863e607e3c9e71fd91ece44a422f88dbe2e (patch) | |
tree | c9f06dcb7b6a3b1b24fa03b79088110cee811a8b /python | |
parent | 0850b7fe7d5266ee0c4153b3e333d93eff164857 (diff) | |
download | sandcrawler-ecd36863e607e3c9e71fd91ece44a422f88dbe2e.tar.gz sandcrawler-ecd36863e607e3c9e71fd91ece44a422f88dbe2e.zip |
ingest: default to html_biblio for PDF URL extraction
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/ingest.py | 41 |
1 files changed, 17 insertions, 24 deletions
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index 2c60b22..2f6be05 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -18,10 +18,7 @@ from sandcrawler.html import extract_fulltext_url from sandcrawler.html_ingest import fetch_html_resources, \ quick_fetch_html_resources, html_guess_scope, html_extract_body_teixml, \ WebResource, html_guess_platform -from sandcrawler.html_metadata import html_extract_fulltext_url, \ - XML_FULLTEXT_PATTERNS, HTML_FULLTEXT_PATTERNS, PDF_FULLTEXT_PATTERNS, \ - BiblioMetadata, html_extract_resources, html_extract_biblio, \ - load_adblock_rules +from sandcrawler.html_metadata import BiblioMetadata, html_extract_resources, html_extract_biblio, load_adblock_rules from sandcrawler.workers import SandcrawlerWorker from sandcrawler.db import SandcrawlerPostgrestClient from sandcrawler.xml import xml_reserialize @@ -607,21 +604,23 @@ class IngestFileWorker(SandcrawlerWorker): pass if ingest_type == "pdf" and html_ish_resource: - # Got landing page or similar + fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body) - # this is the new style of URL extraction - if not fulltext_url and html_biblio and html_biblio.pdf_fulltext_url: - fulltext_url = dict( - pdf_url=html_biblio.pdf_fulltext_url, - technique="html_biblio", - ) + # the new style of URL extraction (already computed) + # we aren't quite ready to adopt this for the PDF path (which + # has more complex logic to avoid loops, etc) + #if not fulltext_url and html_biblio and html_biblio.pdf_fulltext_url: + # fulltext_url = dict( + # pdf_url=html_biblio.pdf_fulltext_url, + # technique="html_biblio", + # ) result['extract_next_hop'] = fulltext_url if not fulltext_url: result['status'] = 'no-pdf-link' return result - next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url') + next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url') or "" assert next_url next_url = clean_url(next_url) 
print("[PARSE {:>6}] {} {}".format( @@ -637,12 +636,9 @@ class IngestFileWorker(SandcrawlerWorker): hops.append(next_url) continue elif ingest_type == "xml" and html_ish_resource: - # parse with selectolax, extract XML fulltext URL - html_doc = HTMLParser(resource.body) - extract_next_hop = html_extract_fulltext_url(resource.terminal_url, html_doc, XML_FULLTEXT_PATTERNS) - if extract_next_hop: - next_url = extract_next_hop[0] - technique = extract_next_hop[1] + if html_biblio and html_biblio.xml_fulltext_url: + next_url = html_biblio.xml_fulltext_url + technique = "html_biblio" print("[PARSE {:>6}] {} {}".format( ingest_type, technique, @@ -656,12 +652,9 @@ class IngestFileWorker(SandcrawlerWorker): hops.append(next_url) continue elif ingest_type == "html" and html_ish_resource: - # parse with selectolax, extract XML fulltext URL - html_doc = HTMLParser(resource.body) - extract_next_hop = html_extract_fulltext_url(resource.terminal_url, html_doc, HTML_FULLTEXT_PATTERNS) - if extract_next_hop: - next_url = extract_next_hop[0] - technique = extract_next_hop[1] + if html_biblio and html_biblio.html_fulltext_url: + next_url = html_biblio.html_fulltext_url + technique = "html_biblio" if next_url in hops: # for HTML ingest, we don't count this as a link-loop break |