about summary refs log tree commit diff stats
path: root/python
diff options
context:
space:
mode:
Diffstat (limited to 'python')
-rw-r--r-- python/sandcrawler/html_ingest.py 4
-rwxr-xr-x python/scripts/doaj2ingestrequest.py 2
2 files changed, 3 insertions, 3 deletions
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 7594365..91b9cd6 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -25,9 +25,9 @@ def html_extract_body_teixml(doc: bytes) -> dict:
include_comments=False,
include_formatting=True,
)
- except (ValueError, TypeError) as e:
+ except (ValueError, TypeError, Exception) as e:
return dict(
- status="parse-error",
+ status="trafilatura-parse-error",
error_msg=str(e)[:1000],
)
if tei_xml:
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index f1bae8c..b981ab6 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -74,7 +74,7 @@ def transform(obj: dict) -> List[dict]:
doi: Optional[str] = None
for ident in (bibjson['identifier'] or []):
- if ident['type'].lower() == "doi" and ident['id'].startswith('10.'):
+ if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'):
doi = ident['id'].lower()
for link in (bibjson['link'] or []):