diff options
Diffstat (limited to 'python')
-rw-r--r-- | python/sandcrawler/html_ingest.py | 30 | ||||
-rw-r--r-- | python/sandcrawler/ingest.py | 9 | ||||
-rw-r--r-- | python/sandcrawler/persist.py | 1 |
3 files changed, 23 insertions, 17 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py index 11909e6..f2819c2 100644 --- a/python/sandcrawler/html_ingest.py +++ b/python/sandcrawler/html_ingest.py @@ -25,7 +25,9 @@ def html_extract_body_teixml(doc: bytes) -> dict: include_formatting=True, ) if tei_xml: - return dict(status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml) + body_txt = teixml_body_text(tei_xml) + word_count = len(body_txt.split()) + return dict(status="success", agent=TRAFILATURA_AGENT, tei_xml=tei_xml, word_count=word_count) elif doc.startswith(b'<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" 2012"http://www.w3.org/TR/html4/loose.dtd">'): # hack for firstmonday.org return html_extract_body_teixml(doc[106:]) @@ -104,8 +106,8 @@ class HtmlMetaRow(pydantic.BaseModel): self.has_teixml, self.has_thumbnail, self.word_count, - self.biblio and json.dumps(self.biblio, sort_keys=True), - self.resources and json.dumps(self.resources, sort_keys=True), + (self.biblio or None) and json.dumps(self.biblio, sort_keys=True), + (self.resources or None) and json.dumps(self.resources, sort_keys=True), ) @@ -154,10 +156,9 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w closest = when and datetime_to_cdx(when) for resource in resources: wayback_resp = wayback_client.lookup_resource(resource['url'], closest=closest) - if not wayback_resp: + if not wayback_resp or wayback_resp.status != 'success': + # TODO: raise a specific exception so we can catch it elsewhere? raise Exception("wayback lookup failed") - # XXX - assert wayback_resp.status == 'success' file_meta = gen_file_metadata(wayback_resp.body) if file_meta['sha1hex'] != wayback_resp.cdx.sha1hex: raise Exception("wayback payload sha1hex mismatch") @@ -167,7 +168,7 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w url=wayback_resp.cdx.url, sha1hex=file_meta['sha1hex'], mimetype=file_meta['mimetype'], - status_code=wayback_resp.cdx.status_code, + status_code=wayback_resp.cdx.status_code or wayback_resp.revisit_cdx.status_code, size=file_meta['size_bytes'], sha256hex=file_meta['sha256hex'], resource_type=resource['type'], @@ -176,7 +177,7 @@ def fetch_html_resources(resources: List[dict], wayback_client: WaybackClient, w return full -def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], tei_xml: Optional[str]) -> str: +def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata], word_count: Optional[int]) -> str: """ This function tries to guess if an HTML document represents one of: @@ -201,15 +202,20 @@ def html_guess_scope(url: str, doc: HTMLParser, biblio: Optional[BiblioMetadata] if "://page-one.live.cf.public.springer.com" in url: return "article-sample" + if "scielo" in url: + if "sci_abstract" in url: + return "landingpage" + if "sci_arttext" in url: + return "article-fulltext" + if biblio and biblio.html_fulltext_url == url: return "article-fulltext" # fallback: guess based word count (arbitrary guesses here) - if not tei_xml: + if word_count == None: return "unknown" - body_txt = teixml_body_text(tei_xml) - word_count = len(body_txt.split()) #print(f" body text word count: {word_count}", file=sys.stderr) + assert word_count is not None if word_count < 20: return "stub" elif word_count > 800: @@ -247,7 +253,7 @@ def run_single(url: str, timestamp: Optional[str] = None, quick_mode: bool = Fal html_doc = HTMLParser(html_resource.body) html_biblio = html_extract_biblio(url, html_doc) html_body = html_extract_body_teixml(html_resource.body) - html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('tei_xml')) + html_scope = html_guess_scope(url, html_doc, html_biblio, html_body.get('word_count')) if html_scope not in ('article-fulltext', 'unknown'): return IngestWebResult( status="wrong-scope", diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py index e0778d2..363dfb8 100644 --- a/python/sandcrawler/ingest.py +++ b/python/sandcrawler/ingest.py @@ -343,13 +343,14 @@ class IngestFileWorker(SandcrawlerWorker): html_biblio = html_extract_biblio(resource.terminal_url, html_doc) assert html_biblio html_body = html_extract_body_teixml(resource.body) - html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('tei_xml')) + html_scope = html_guess_scope(resource.terminal_url, html_doc, html_biblio, html_body.get('word_count')) + html_biblio_dict = json.loads(html_biblio.json(exclude_none=True)) if html_scope not in ('article-fulltext', 'unknown'): html_body.pop("tei_xml", None) return dict( status="html-body-wrong-scope", - html_biblio=html_biblio, + html_biblio=html_biblio_dict, html_scope=html_scope, ) @@ -358,7 +359,7 @@ class IngestFileWorker(SandcrawlerWorker): html_body.pop("tei_xml", None) return dict( status="too-many-resources", - html_biblio=html_biblio, + html_biblio=html_biblio_dict, html_scope=html_scope, ) @@ -377,7 +378,7 @@ class IngestFileWorker(SandcrawlerWorker): return dict( html_body=html_body, - html_biblio=json.loads(html_biblio.json(exclude_none=True)), + html_biblio=html_biblio_dict, scope=html_scope, html_resources=[json.loads(r.json(exclude_none=True)) for r in full_resources], ) diff --git a/python/sandcrawler/persist.py b/python/sandcrawler/persist.py index fbd2bdb..f13b1f3 100644 --- a/python/sandcrawler/persist.py +++ b/python/sandcrawler/persist.py @@ -97,7 +97,6 @@ class PersistIngestFileResultWorker(SandcrawlerWorker): self.counts['skip-request-fields'] += 1 return None if raw['ingest_type'] not in ('pdf', 'xml', 'html'): - print(raw['ingest_type']) self.counts['skip-ingest-type'] += 1 return None request = { |