From 5495d2ba4c92cf3ea3f1c31efe9ca670f6900047 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 13 Sep 2021 19:33:08 -0700
Subject: ingest: basic 'component' and 'src' support

---
 proposals/2021-09-09_component_ingest.md | 114 +++++++++++++++++++++++++++++++
 proposals/2021-09-13_src_ingest.md       |  53 ++++++++++++++
 python/sandcrawler/html_metadata.py      |  15 ++++
 python/sandcrawler/ingest.py             |  89 ++++++++++++++++++------
 4 files changed, 251 insertions(+), 20 deletions(-)
 create mode 100644 proposals/2021-09-09_component_ingest.md
 create mode 100644 proposals/2021-09-13_src_ingest.md

diff --git a/proposals/2021-09-09_component_ingest.md b/proposals/2021-09-09_component_ingest.md
new file mode 100644
index 0000000..09dee4f
--- /dev/null
+++ b/proposals/2021-09-09_component_ingest.md
@@ -0,0 +1,114 @@
+
+File Ingest Mode: 'component'
+=============================
+
+A new ingest type for downloading individual files which are a subset of a
+complete work.
+
+Some publishers now assign DOIs to individual figures, supplements, and other
+"components" of an overall release or document.
+
+Initial mimetypes to allow:
+
+- image/jpeg
+- image/tiff
+- image/png
+- image/gif
+- audio/mpeg
+- video/mp4
+- video/mpeg
+- text/plain
+- text/csv
+- application/json
+- application/xml
+- application/pdf
+- application/gzip
+- application/x-bzip
+- application/x-bzip2
+- application/zip
+- application/x-rar
+- application/x-7z-compressed
+- application/x-tar
+- application/vnd.ms-powerpoint
+- application/vnd.ms-excel
+- application/msword
+- application/vnd.openxmlformats-officedocument.wordprocessingml.document
+- application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+
+Intentionally not supporting:
+
+- text/html
+
+
+## Fatcat Changes
+
+In the file importer, allow the additional mimetypes for 'component' ingest.
+
+
+## Ingest Changes
+
+Allow additional terminal mimetypes for 'component' crawls.
+
+
+## Examples
+
+There are hundreds of thousands of these component DOIs. Some examples:
+
+#### ACS Supplement File
+
+Redirects directly to a .zip file in the browser. SPN is blocked by a cookie
+check.
+
+#### Frontiers .docx Supplement
+
+Redirects to the full article page. There is a pop-up for figshare; seems
+hard to process.
+
+#### Figshare Single File
+
+As 'component' type in fatcat.
+
+Redirects to a landing page. Dataset ingest seems more appropriate for this
+entire domain.
+
+#### PeerJ Supplement File
+
+PeerJ is hard because it redirects to a single HTML page, which has links to
+supplements in the HTML. Perhaps a custom extractor will work.
+
+#### eLife
+
+The current crawl mechanism makes it seemingly impossible to extract a
+specific supplement from the document as a whole.
+
+#### ZooKeys
+
+These are extractable.
+
+#### OECD PDF Supplement
+
+Has an Excel (.xls) link, but then hits a paywall.
+
+#### Direct File Link
+
+This one is also OECD, but is a simple direct download.
+
+#### Protein Data Bank (PDB) Entry
+
+Multiple files; dataset/fileset is more appropriate for these.
diff --git a/proposals/2021-09-13_src_ingest.md b/proposals/2021-09-13_src_ingest.md
new file mode 100644
index 0000000..470827a
--- /dev/null
+++ b/proposals/2021-09-13_src_ingest.md
@@ -0,0 +1,53 @@
+
+File Ingest Mode: 'src'
+=======================
+
+Ingest type for the "source" of works in document form. For example, tarballs
+of LaTeX source and figures, as published on arxiv.org and PubMed Central.
+
+For now, the presumption is that this would be a single file (`file` entity in
+fatcat).
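+
+An ingest request for this mode might look something like the following (a
+sketch; the field names follow the existing ingest request schema, with
+identifiers taken from the arxiv example below):
+
+    {
+        "ingest_type": "src",
+        "base_url": "https://arxiv.org/e-print/2109.00954v1",
+        "link_source": "arxiv",
+        "link_source_id": "2109.00954v1"
+    }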
+
+Initial mimetypes to allow:
+
+- text/x-tex
+- application/xml
+- application/gzip
+- application/x-bzip
+- application/x-bzip2
+- application/zip
+- application/x-tar
+- application/msword
+- application/vnd.openxmlformats-officedocument.wordprocessingml.document
+
+
+## Fatcat Changes
+
+In the file importer, allow the additional mimetypes for 'src' ingest.
+
+Might keep ingest disabled on the fatcat side, at least initially. E.g., until
+there is some notion of "file scope", or other ways of treating 'src' tarballs
+separately from PDFs or other fulltext formats.
+
+
+## Ingest Changes
+
+Allow additional terminal mimetypes for 'src' crawls.
+
+
+## Examples
+
+    arxiv:2109.00954v1
+    fatcat:release_akzp2lgqjbcbhpoeoitsj5k5hy
+    https://arxiv.org/format/2109.00954v1
+    https://arxiv.org/e-print/2109.00954v1
+
+    arxiv:1912.03397v2
+    https://arxiv.org/format/1912.03397v2
+    https://arxiv.org/e-print/1912.03397v2
+    NOT: https://arxiv.org/pdf/1912.03397v2
+
+    pmcid:PMC3767916
+    https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/03/PMC3767916.tar.gz
+
+For PMC, we will need to use one of the .csv file lists to get the digit prefixes.
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 44576e6..1a328ef 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -259,6 +259,17 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [
     },
 ]
 
+COMPONENT_FULLTEXT_PATTERNS: List[dict] = [
+    {
+        "in_doc_url": "pensoft.net/article/",  # also /element/
+        "in_fulltext_url": "/download/fig/",
+        "selector": ".Main-Content .figure a.P-Article-Preview-Picture-Download-Small",
+        "attr": "href",
+        "technique": "Active figure download link (zookeys)",
+        "example_page": "https://zookeys.pensoft.net/article/38576/element/2/153/",
+    },
+]
+
 # This is a database of matching patterns. Most of these discovered by hand,
 # looking at OA journal content that failed to crawl/ingest.
 PDF_FULLTEXT_PATTERNS: List[dict] = [
@@ -623,6 +634,7 @@ class BiblioMetadata(pydantic.BaseModel):
     pdf_fulltext_url: Optional[str]
     html_fulltext_url: Optional[str]
     xml_fulltext_url: Optional[str]
+    component_url: Optional[str]
 
     class Config:
         json_encoders = {
@@ -705,6 +717,9 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
     html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS)
     if html_fulltext_url:
         meta['html_fulltext_url'] = html_fulltext_url[0]
+    component_url = html_extract_fulltext_url(doc_url, doc, COMPONENT_FULLTEXT_PATTERNS)
+    if component_url:
+        meta['component_url'] = component_url[0]
 
     # TODO: replace with clean_doi() et al
     if meta.get('doi') and meta.get('doi').startswith('doi:'):
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index c736878..b852c69 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -173,6 +173,44 @@ class IngestFileWorker(SandcrawlerWorker):
             "://s3-eu-west-1.amazonaws.com/",
         ]
 
+        self.src_valid_mimetypes = [
+            "text/x-tex",
+            "application/gzip",
+            "application/x-bzip",
+            "application/x-bzip2",
+            "application/zip",
+            "application/x-tar",
+            "application/msword",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+        ]
+
+        self.component_valid_mimetypes = [
+            "image/jpeg",
+            "image/tiff",
+            "image/png",
+            "image/gif",
+            "audio/mpeg",
+            "video/mp4",
+            "video/mpeg",
+            "text/plain",
+            "text/csv",
+            "application/json",
+            "application/xml",
+            "application/pdf",
+            "application/gzip",
+            "application/x-bzip",
+            "application/x-bzip2",
+            "application/zip",
+            "application/x-rar",
+            "application/x-7z-compressed",
+            "application/x-tar",
+            "application/vnd.ms-powerpoint",
+            "application/vnd.ms-excel",
+            "application/msword",
+            "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+            "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+        ]
+
     def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
         """
@@ -284,6 +322,10 @@ class IngestFileWorker(SandcrawlerWorker):
             if 'html_biblio' in html_info and not html_info['html_biblio']:
                 html_info.pop('html_biblio')
             return html_info
+        elif ingest_type == "src":
+            return {}
+        elif ingest_type == "component":
+            return {}
         else:
             raise NotImplementedError(f"process {ingest_type} hit")
@@ -473,7 +515,7 @@ class IngestFileWorker(SandcrawlerWorker):
         )
 
     def want(self, request: dict) -> bool:
-        if not request.get('ingest_type') in ('file', 'pdf', 'xml', 'html'):
+        if not request.get('ingest_type') in ('file', 'pdf', 'xml', 'html', 'src', 'component'):
             return False
         return True
@@ -484,7 +526,7 @@ class IngestFileWorker(SandcrawlerWorker):
             request['ingest_type'] = 'pdf'
 
         ingest_type = request.get('ingest_type')
-        if ingest_type not in ("pdf", "xml", "html"):
+        if ingest_type not in ("pdf", "xml", "html", "src", "component"):
             raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
 
         # parse/clean URL
@@ -508,6 +550,8 @@ class IngestFileWorker(SandcrawlerWorker):
             best_mimetype = "text/xml"
         elif ingest_type == "html":
             best_mimetype = "text/html"
+        elif ingest_type == "src":
+            best_mimetype = "application/gzip"
 
         existing = self.check_existing_ingest(ingest_type, base_url)
         if existing:
@@ -668,9 +712,18 @@ class IngestFileWorker(SandcrawlerWorker):
                     return result
                 hops.append(next_url)
                 continue
-            elif ingest_type == "xml" and html_ish_resource:
-                if html_biblio and html_biblio.xml_fulltext_url:
-                    next_url = html_biblio.xml_fulltext_url
+            elif ingest_type in ("xml", "html", "component") and html_ish_resource and html_biblio:
+                # NOTE: src_fulltext_url is not a thing
+                next_url_found = None
+                if ingest_type == "xml" and html_biblio.xml_fulltext_url:
+                    next_url_found = html_biblio.xml_fulltext_url
+                elif ingest_type == "html" and html_biblio.html_fulltext_url:
+                    next_url_found = html_biblio.html_fulltext_url
+                elif ingest_type == "component" and html_biblio.component_url:
+                    next_url_found = html_biblio.component_url
+
+                if next_url_found:
+                    next_url = next_url_found
                     technique = "html_biblio"
                     print("[PARSE {:>6}] {} {}".format(
                         ingest_type,
                         technique,
                         next_url,
                     ),
                           file=sys.stderr)
                     if next_url in hops:
+                        if ingest_type == "html":
+                            # for HTML ingest, we don't count this as a link-loop
+                            break
                         result['status'] = 'link-loop'
                         result['error_message'] = "repeated: {}".format(next_url)
                         return result
                     hops.append(next_url)
                     continue
-            elif ingest_type == "html" and html_ish_resource:
-                if html_biblio and html_biblio.html_fulltext_url:
-                    next_url = html_biblio.html_fulltext_url
-                    technique = "html_biblio"
-                    if next_url in hops:
-                        # for HTML ingest, we don't count this as a link-loop
-                        break
-                    print("[PARSE {:>6}] {} {}".format(
-                        ingest_type,
-                        technique,
-                        next_url,
-                    ),
-                          file=sys.stderr)
-                    hops.append(next_url)
-                    continue
 
             # default is to NOT keep hopping
             break
@@ -737,6 +778,14 @@ class IngestFileWorker(SandcrawlerWorker):
             if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"):
                 result['status'] = "wrong-mimetype"
                 return result
+        elif ingest_type == "src":
+            if file_meta['mimetype'] not in self.src_valid_mimetypes:
+                result['status'] = "wrong-mimetype"
+                return result
+        elif ingest_type == "component":
+            if file_meta['mimetype'] not in self.component_valid_mimetypes:
+                result['status'] = "wrong-mimetype"
+                return result
         else:
             raise NotImplementedError()
-- 
cgit v1.2.3
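To exercise the new ingest types end-to-end, a minimal driver might look like
the following (a sketch: it assumes `IngestFileWorker()` can be constructed
with its default client dependencies and that `process()` takes a request
dict, per the worker interface patched above; the URLs come from the examples
in the two proposals):

    from sandcrawler.ingest import IngestFileWorker

    worker = IngestFileWorker()

    # 'src' ingest: the terminal file must match src_valid_mimetypes,
    # so an arxiv e-print tarball should pass, but a PDF would not
    src_result = worker.process({
        "ingest_type": "src",
        "base_url": "https://arxiv.org/e-print/2109.00954v1",
    })
    print(src_result["status"])  # e.g. "success" or "wrong-mimetype"

    # 'component' ingest: a landing page can be resolved to the actual
    # file via COMPONENT_FULLTEXT_PATTERNS (here, the zookeys pattern)
    component_result = worker.process({
        "ingest_type": "component",
        "base_url": "https://zookeys.pensoft.net/article/38576/element/2/153/",
    })
    print(component_result["status"])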