aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--proposals/2021-09-09_component_ingest.md114
-rw-r--r--proposals/2021-09-13_src_ingest.md53
-rw-r--r--python/sandcrawler/html_metadata.py15
-rw-r--r--python/sandcrawler/ingest.py89
4 files changed, 251 insertions, 20 deletions
diff --git a/proposals/2021-09-09_component_ingest.md b/proposals/2021-09-09_component_ingest.md
new file mode 100644
index 0000000..09dee4f
--- /dev/null
+++ b/proposals/2021-09-09_component_ingest.md
@@ -0,0 +1,114 @@
+
+File Ingest Mode: 'component'
+=============================
+
+A new ingest type for downloading individual files which are a subset of a
+complete work.
+
+Some publishers now assign DOIs to individual figures, supplements, and other
+"components" of an overall release or document.
+
+Initial mimetypes to allow:
+
+- image/jpeg
+- image/tiff
+- image/png
+- image/gif
+- audio/mpeg
+- video/mp4
+- video/mpeg
+- text/plain
+- text/csv
+- application/json
+- application/xml
+- application/pdf
+- application/gzip
+- application/x-bzip
+- application/x-bzip2
+- application/zip
+- application/x-rar
+- application/x-7z-compressed
+- application/x-tar
+- application/vnd.ms-powerpoint
+- application/vnd.ms-excel
+- application/msword
+- application/vnd.openxmlformats-officedocument.wordprocessingml.document
+- application/vnd.openxmlformats-officedocument.spreadsheetml.sheet
+
+Intentionally not supporting:
+
+- text/html
+
+
+## Fatcat Changes
+
+In the file importer, allow the additional mimetypes for 'component' ingest.
+
+
+## Ingest Changes
+
+Allow additional terminal mimetypes for 'component' crawls.
+
+
+## Examples
+
+Hundreds of thousands: <https://fatcat.wiki/release/search?q=type%3Acomponent+in_ia%3Afalse>
+
+#### ACS Supplement File
+
+<https://doi.org/10.1021/acscatal.0c02627.s002>
+
+Redirects directly to .zip in browser. SPN is blocked by cookie check.
+
+#### Frontiers .docx Supplement
+
+<https://doi.org/10.3389/fpls.2019.01642.s001>
+
+Redirects to full article page. There is a pop-up for figshare, seems hard to process.
+
+#### Figshare Single File
+
+<https://doi.org/10.6084/m9.figshare.13646972.v1>
+
+As 'component' type in fatcat.
+
+Redirects to a landing page. Dataset ingest seems more appropriate for this entire domain.
+
+#### PeerJ supplement file
+
+<https://doi.org/10.7717/peerj.10257/supp-7>
+
+PeerJ is hard because it redirects to a single HTML page, which has links to
+supplements in the HTML. Perhaps a custom extractor will work.
+
+#### eLife
+
+<https://doi.org/10.7554/elife.38407.010>
+
+The current crawl mechanism makes it seemingly impossible to extract a specific
+supplement from the document as a whole.
+
+#### Zookeys
+
+<https://doi.org/10.3897/zookeys.895.38576.figure53>
+
+These are extractable.
+
+#### OECD PDF Supplement
+
+<https://doi.org/10.1787/f08c6324-en>
+<https://www.oecd-ilibrary.org/trade/imports-of-services-billions-of-us-dollars_f08c6324-en>
+
+Has an Excel (.xls) link, great, but then paywall.
+
+#### Direct File Link
+
+<https://doi.org/10.1787/888934207500>
+
+This one is also OECD, but is a simple direct download.
+
+#### Protein Data Base (PDB) Entry
+
+<https://doi.org/10.2210/pdb6ls2/pdb>
+
+Multiple files; dataset/fileset more appropriate for these.
diff --git a/proposals/2021-09-13_src_ingest.md b/proposals/2021-09-13_src_ingest.md
new file mode 100644
index 0000000..470827a
--- /dev/null
+++ b/proposals/2021-09-13_src_ingest.md
@@ -0,0 +1,53 @@
+
+File Ingest Mode: 'src'
+=======================
+
+Ingest type for "source" of works in document form. For example, tarballs of
+LaTeX source and figures, as published on arxiv.org and PubMed Central.
+
+For now, presumption is that this would be a single file (`file` entity in
+fatcat).
+
+Initial mimetypes to allow:
+
+- text/x-tex
+- application/xml
+- application/gzip
+- application/x-bzip
+- application/x-bzip2
+- application/zip
+- application/x-tar
+- application/msword
+- application/vnd.openxmlformats-officedocument.wordprocessingml.document
+
+
+## Fatcat Changes
+
+In the file importer, allow the additional mimetypes for 'src' ingest.
+
+Might keep ingest disabled on the fatcat side, at least initially. Eg, until
+there is some scope of "file scope", or other ways of treating 'src' tarballs
+separate from PDFs or other fulltext formats.
+
+
+## Ingest Changes
+
+Allow additional terminal mimetypes for 'src' crawls.
+
+
+## Examples
+
+ arxiv:2109.00954v1
+ fatcat:release_akzp2lgqjbcbhpoeoitsj5k5hy
+ https://arxiv.org/format/2109.00954v1
+ https://arxiv.org/e-print/2109.00954v1
+
+ arxiv:1912.03397v2
+ https://arxiv.org/format/1912.03397v2
+ https://arxiv.org/e-print/1912.03397v2
+ NOT: https://arxiv.org/pdf/1912.03397v2
+
+ pmcid:PMC3767916
+ https://ftp.ncbi.nlm.nih.gov/pub/pmc/oa_package/08/03/PMC3767916.tar.gz
+
+For PMC, will need to use one of the .csv file lists to get the digit prefixes.
diff --git a/python/sandcrawler/html_metadata.py b/python/sandcrawler/html_metadata.py
index 44576e6..1a328ef 100644
--- a/python/sandcrawler/html_metadata.py
+++ b/python/sandcrawler/html_metadata.py
@@ -259,6 +259,17 @@ HTML_FULLTEXT_PATTERNS: List[dict] = [
},
]
+COMPONENT_FULLTEXT_PATTERNS: List[dict] = [
+ {
+ "in_doc_url": "pensoft.net/article/", # also /element/
+ "in_fulltext_url": "/download/fig/",
+ "selector": ".Main-Content .figure a.P-Article-Preview-Picture-Download-Small",
+ "attr": "href",
+ "technique": "Active figure download link (zookeys)",
+ "example_page": "https://zookeys.pensoft.net/article/38576/element/2/153/",
+ },
+]
+
# This is a database of matching patterns. Most of these discovered by hand,
# looking at OA journal content that failed to craw/ingest.
PDF_FULLTEXT_PATTERNS: List[dict] = [
@@ -623,6 +634,7 @@ class BiblioMetadata(pydantic.BaseModel):
pdf_fulltext_url: Optional[str]
html_fulltext_url: Optional[str]
xml_fulltext_url: Optional[str]
+ component_url: Optional[str]
class Config:
json_encoders = {
@@ -705,6 +717,9 @@ def html_extract_biblio(doc_url: str, doc: HTMLParser) -> Optional[BiblioMetadat
html_fulltext_url = html_extract_fulltext_url(doc_url, doc, HTML_FULLTEXT_PATTERNS)
if html_fulltext_url:
meta['html_fulltext_url'] = html_fulltext_url[0]
+ component_url = html_extract_fulltext_url(doc_url, doc, COMPONENT_FULLTEXT_PATTERNS)
+ if component_url:
+ meta['component_url'] = component_url[0]
# TODO: replace with clean_doi() et al
if meta.get('doi') and meta.get('doi').startswith('doi:'):
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index c736878..b852c69 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -173,6 +173,44 @@ class IngestFileWorker(SandcrawlerWorker):
"://s3-eu-west-1.amazonaws.com/",
]
+ self.src_valid_mimetypes = [
+ "text/x-tex",
+ "application/gzip",
+ "application/x-bzip",
+ "application/x-bzip2",
+ "application/zip",
+ "application/x-tar",
+ "application/msword",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ ]
+
+ self.component_valid_mimetypes = [
+ "image/jpeg",
+ "image/tiff",
+ "image/png",
+ "image/gif",
+ "audio/mpeg",
+ "video/mp4",
+ "video/mpeg",
+ "text/plain",
+ "text/csv",
+ "application/json",
+ "application/xml",
+ "application/pdf",
+ "application/gzip",
+ "application/x-bzip",
+ "application/x-bzip2",
+            "application/zip",
+            "application/x-rar",
+ "application/x-7z-compressed",
+ "application/x-tar",
+ "application/vnd.ms-powerpoint",
+ "application/vnd.ms-excel",
+ "application/msword",
+ "application/vnd.openxmlformats-officedocument.wordprocessingml.document",
+ "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet",
+ ]
+
def check_existing_ingest(self, ingest_type: str, base_url: str) -> Optional[dict]:
"""
@@ -284,6 +322,10 @@ class IngestFileWorker(SandcrawlerWorker):
if 'html_biblio' in html_info and not html_info['html_biblio']:
html_info.pop('html_biblio')
return html_info
+ elif ingest_type == "src":
+ return {}
+ elif ingest_type == "component":
+ return {}
else:
raise NotImplementedError(f"process {ingest_type} hit")
@@ -473,7 +515,7 @@ class IngestFileWorker(SandcrawlerWorker):
)
def want(self, request: dict) -> bool:
- if not request.get('ingest_type') in ('file', 'pdf', 'xml', 'html'):
+ if not request.get('ingest_type') in ('file', 'pdf', 'xml', 'html', 'src', 'component'):
return False
return True
@@ -484,7 +526,7 @@ class IngestFileWorker(SandcrawlerWorker):
request['ingest_type'] = 'pdf'
ingest_type = request.get('ingest_type')
- if ingest_type not in ("pdf", "xml", "html"):
+ if ingest_type not in ("pdf", "xml", "html", "src", "component"):
raise NotImplementedError(f"can't handle ingest_type={ingest_type}")
# parse/clean URL
@@ -508,6 +550,8 @@ class IngestFileWorker(SandcrawlerWorker):
best_mimetype = "text/xml"
elif ingest_type == "html":
best_mimetype = "text/html"
+ elif ingest_type == "src":
+ best_mimetype = "application/gzip"
existing = self.check_existing_ingest(ingest_type, base_url)
if existing:
@@ -668,9 +712,18 @@ class IngestFileWorker(SandcrawlerWorker):
return result
hops.append(next_url)
continue
- elif ingest_type == "xml" and html_ish_resource:
- if html_biblio and html_biblio.xml_fulltext_url:
- next_url = html_biblio.xml_fulltext_url
+ elif ingest_type in ("xml", "html", "component") and html_ish_resource and html_biblio:
+ # NOTE: src_fulltext_url is not a thing
+ next_url_found = None
+ if ingest_type == "xml" and html_biblio.xml_fulltext_url:
+ next_url_found = html_biblio.xml_fulltext_url
+ elif ingest_type == "html" and html_biblio.html_fulltext_url:
+ next_url_found = html_biblio.html_fulltext_url
+ elif ingest_type == "component" and html_biblio.component_url:
+ next_url_found = html_biblio.component_url
+
+ if next_url_found:
+ next_url = next_url_found
technique = "html_biblio"
print("[PARSE {:>6}] {} {}".format(
ingest_type,
@@ -679,26 +732,14 @@ class IngestFileWorker(SandcrawlerWorker):
),
file=sys.stderr)
if next_url in hops:
+ if ingest_type == "html":
+ # for HTML ingest, we don't count this as a link-loop
+ break
result['status'] = 'link-loop'
result['error_message'] = "repeated: {}".format(next_url)
return result
hops.append(next_url)
continue
- elif ingest_type == "html" and html_ish_resource:
- if html_biblio and html_biblio.html_fulltext_url:
- next_url = html_biblio.html_fulltext_url
- technique = "html_biblio"
- if next_url in hops:
- # for HTML ingest, we don't count this as a link-loop
- break
- print("[PARSE {:>6}] {} {}".format(
- ingest_type,
- technique,
- next_url,
- ),
- file=sys.stderr)
- hops.append(next_url)
- continue
# default is to NOT keep hopping
break
@@ -737,6 +778,14 @@ class IngestFileWorker(SandcrawlerWorker):
if file_meta['mimetype'] not in ("text/html", "application/xhtml+xml"):
result['status'] = "wrong-mimetype"
return result
+ elif ingest_type == "src":
+ if file_meta['mimetype'] not in self.src_valid_mimetypes:
+ result['status'] = "wrong-mimetype"
+ return result
+ elif ingest_type == "component":
+ if file_meta['mimetype'] not in self.component_valid_mimetypes:
+ result['status'] = "wrong-mimetype"
+ return result
else:
raise NotImplementedError()