author     Bryan Newbold <bnewbold@archive.org>  2020-11-03 17:16:39 -0800
committer  Bryan Newbold <bnewbold@archive.org>  2020-11-03 17:16:39 -0800
commit     ab1c3cb70a9bc13ab2ff971a701aa9615c73d205 (patch)
tree       72ef5a1f1325815490ab008bcaa3a25a3e74e680 /python
parent     84a44a8d98c2d9872af0962021b19def7173e8ba (diff)
download   sandcrawler-ab1c3cb70a9bc13ab2ff971a701aa9615c73d205.tar.gz
           sandcrawler-ab1c3cb70a9bc13ab2ff971a701aa9615c73d205.zip
basic support for XML ingest in worker
Diffstat (limited to 'python')
-rw-r--r--  python/sandcrawler/ingest.py | 63
1 file changed, 40 insertions(+), 23 deletions(-)
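In short: the hop loop gains an "xml" branch that parses HTML-ish landing pages with selectolax and looks for an XML fulltext link, and the final hit is gated on an XML mimetype. A minimal sketch of that mimetype gate, with the accepted types taken from the last hunk below (the helper name itself is hypothetical, not part of the commit):

# Hypothetical helper; mirrors the mimetype check added in the last hunk below.
XML_ACCEPT_MIMETYPES = ("application/xml", "text/xml", "application/jats+xml")

def xml_hit_acceptable(file_meta: dict) -> bool:
    # file_meta comes from gen_file_metadata(resource.body), as in the worker
    return file_meta['mimetype'] in XML_ACCEPT_MIMETYPES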
diff --git a/python/sandcrawler/ingest.py b/python/sandcrawler/ingest.py
index 633e856..7ad0124 100644
--- a/python/sandcrawler/ingest.py
+++ b/python/sandcrawler/ingest.py
@@ -8,12 +8,14 @@ import requests
 from typing import Optional, Tuple, Any, Dict
 from http.server import BaseHTTPRequestHandler, HTTPServer
 from collections import namedtuple
+from selectolax.parser import HTMLParser
 
 from sandcrawler.ia import SavePageNowClient, CdxApiClient, WaybackClient, WaybackError, WaybackContentError, SavePageNowError, CdxApiError, PetaboxError, cdx_to_dict, ResourceResult, fix_transfer_encoding
 from sandcrawler.grobid import GrobidClient
 from sandcrawler.pdfextract import process_pdf, PdfExtractResult
 from sandcrawler.misc import gen_file_metadata, clean_url
 from sandcrawler.html import extract_fulltext_url
+from sandcrawler.html_metadata import html_extract_fulltext_url, XML_FULLTEXT_PATTERNS
 from sandcrawler.workers import SandcrawlerWorker
 from sandcrawler.db import SandcrawlerPostgrestClient
 
@@ -241,7 +243,7 @@ class IngestFileWorker(SandcrawlerWorker):
             }
         elif ingest_type == "xml":
             # TODO
-            raise NotImplementedError(f"process {ingest_type} hit")
+            return {}
         else:
             raise NotImplementedError(f"process {ingest_type} hit")
 
@@ -441,39 +443,50 @@ class IngestFileWorker(SandcrawlerWorker):
                 result['status'] = 'null-body'
                 return result
 
-            # here is where we split based on ingest type
+            # here we split based on ingest type to try and extract a next hop
             html_ish_resource = bool(
                 "html" in file_meta['mimetype']
                 or "xhtml" in file_meta['mimetype']
                 or "application/xml" in file_meta['mimetype']
                 or "text/xml" in file_meta['mimetype']
             )
-            if ingest_type == "pdf":
-                if html_ish_resource:
-                    # Got landing page or similar. Some XHTML detected as "application/xml"
-                    fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
-
-                    result['html'] = fulltext_url
-                    if not fulltext_url:
-                        result['status'] = 'no-pdf-link'
-                        return result
-                    next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
-                    assert next_url
-                    next_url = clean_url(next_url)
+            if ingest_type == "pdf" and html_ish_resource:
+                # Got landing page or similar. Some XHTML detected as "application/xml"
+                fulltext_url = extract_fulltext_url(resource.terminal_url, resource.body)
+                result['extract_next_hop'] = fulltext_url
+
+                if not fulltext_url:
+                    result['status'] = 'no-pdf-link'
+                    return result
+                next_url = fulltext_url.get('pdf_url') or fulltext_url.get('next_url')
+                assert next_url
+                next_url = clean_url(next_url)
+                print("[PARSE {:>6}] {} {}".format(
+                    ingest_type,
+                    fulltext_url.get('technique'),
+                    next_url,
+                ),
+                file=sys.stderr)
+                if next_url in hops:
+                    result['status'] = 'link-loop'
+                    result['error_message'] = "repeated: {}".format(next_url)
+                    return result
+                hops.append(next_url)
+                continue
+            elif ingest_type == "xml" and html_ish_resource:
+                # parse with selectolax, extract XML fulltext URL
+                html_doc = HTMLParser(resource.body)
+                extract_next_hop = html_extract_fulltext_url(resource.terminal_url, html_doc, XML_FULLTEXT_PATTERNS)
+                if extract_next_hop:
+                    next_url = extract_next_hop[0]
+                    technique = extract_next_hop[1]
                     print("[PARSE {:>6}] {} {}".format(
                         ingest_type,
-                        fulltext_url.get('technique'),
+                        technique,
                         next_url,
                     ),
                     file=sys.stderr)
-                    if next_url in hops:
-                        result['status'] = 'link-loop'
-                        result['error_message'] = "repeated: {}".format(next_url)
-                        return result
-                    hops.append(next_url)
                     continue
-            else:
-                raise NotImplementedError()
 
             # default is to NOT keep hopping
             break
@@ -501,9 +514,13 @@
             result['revisit_cdx'] = cdx_to_dict(resource.revisit_cdx)
 
         if ingest_type == "pdf":
-            if not file_meta['mimetype'] == "application/pdf":
+            if file_meta['mimetype'] != "application/pdf":
                 result['status'] = "wrong-mimetype"  # formerly: "other-mimetype"
                 return result
+        elif ingest_type == "xml":
+            if file_meta['mimetype'] not in ("application/xml", "text/xml", "application/jats+xml"):
+                result['status'] = "wrong-mimetype"
+                return result
         else:
             raise NotImplementedError()
 
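Taken together: for ingest_type == "xml", an HTML-ish response is parsed with selectolax and scanned against XML_FULLTEXT_PATTERNS for a next hop; otherwise the hop loop breaks as before. A minimal standalone sketch of that extraction step, assuming (as the extract_next_hop[0]/[1] indexing in the hunk suggests) that html_extract_fulltext_url() returns an indexable (url, technique) pair, or a falsy value when nothing matches:

# Sketch only: mirrors the new "xml" branch in IngestFileWorker.process(),
# not the literal committed code.
from typing import Optional, Tuple

from selectolax.parser import HTMLParser

from sandcrawler.html_metadata import XML_FULLTEXT_PATTERNS, html_extract_fulltext_url


def xml_next_hop(terminal_url: str, body: bytes) -> Optional[Tuple[str, str]]:
    """Try to find an XML fulltext link on an HTML-ish landing page."""
    html_doc = HTMLParser(body)
    hit = html_extract_fulltext_url(terminal_url, html_doc, XML_FULLTEXT_PATTERNS)
    if not hit:
        return None
    # treated as (next_url, technique), matching the worker code in the diff
    return hit[0], hit[1]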