about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-11-10 14:16:34 -0800
committer: Bryan Newbold <bnewbold@archive.org> 2020-11-10 14:16:34 -0800
commit: 8e52c019bac85db1bcec743435fed4d346247959 (patch)
tree: 3ee49a0738aef30a026589604e79c5cd1c165da7
parent: 4dcbeb2d044041bd8cfb169374474e8f80ebf989 (diff)
download: sandcrawler-8e52c019bac85db1bcec743435fed4d346247959.tar.gz
download: sandcrawler-8e52c019bac85db1bcec743435fed4d346247959.zip
DOAJ and HTML ingest tweaks from QA run
-rw-r--r-- python/sandcrawler/html_ingest.py 4
-rwxr-xr-x python/scripts/doaj2ingestrequest.py 2
2 files changed, 3 insertions, 3 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 7594365..91b9cd6 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -25,9 +25,9 @@ def html_extract_body_teixml(doc: bytes) -> dict:
include_comments=False,
include_formatting=True,
)
- except (ValueError, TypeError) as e:
+ except (ValueError, TypeError, Exception) as e:
return dict(
- status="parse-error",
+ status="trafilatura-parse-error",
error_msg=str(e)[:1000],
)
if tei_xml:
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index f1bae8c..b981ab6 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -74,7 +74,7 @@ def transform(obj: dict) -> List[dict]:
doi: Optional[str] = None
for ident in (bibjson['identifier'] or []):
- if ident['type'].lower() == "doi" and ident['id'].startswith('10.'):
+ if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'):
doi = ident['id'].lower()
for link in (bibjson['link'] or []):