diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-11-10 14:16:34 -0800 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-11-10 14:16:34 -0800 |
commit | 8e52c019bac85db1bcec743435fed4d346247959 (patch) | |
tree | 3ee49a0738aef30a026589604e79c5cd1c165da7 | |
parent | 4dcbeb2d044041bd8cfb169374474e8f80ebf989 (diff) | |
download | sandcrawler-8e52c019bac85db1bcec743435fed4d346247959.tar.gz sandcrawler-8e52c019bac85db1bcec743435fed4d346247959.zip |
DOAJ and HTML ingest tweaks from QA run
-rw-r--r-- | python/sandcrawler/html_ingest.py | 4 | ||||
-rwxr-xr-x | python/scripts/doaj2ingestrequest.py | 2 |
2 files changed, 3 insertions, 3 deletions
```diff
diff --git a/python/sandcrawler/html_ingest.py b/python/sandcrawler/html_ingest.py
index 7594365..91b9cd6 100644
--- a/python/sandcrawler/html_ingest.py
+++ b/python/sandcrawler/html_ingest.py
@@ -25,9 +25,9 @@ def html_extract_body_teixml(doc: bytes) -> dict:
             include_comments=False,
             include_formatting=True,
         )
-    except (ValueError, TypeError) as e:
+    except (ValueError, TypeError, Exception) as e:
         return dict(
-            status="parse-error",
+            status="trafilatura-parse-error",
             error_msg=str(e)[:1000],
         )
     if tei_xml:
diff --git a/python/scripts/doaj2ingestrequest.py b/python/scripts/doaj2ingestrequest.py
index f1bae8c..b981ab6 100755
--- a/python/scripts/doaj2ingestrequest.py
+++ b/python/scripts/doaj2ingestrequest.py
@@ -74,7 +74,7 @@ def transform(obj: dict) -> List[dict]:
 
     doi: Optional[str] = None
     for ident in (bibjson['identifier'] or []):
-        if ident['type'].lower() == "doi" and ident['id'].startswith('10.'):
+        if ident['type'].lower() == "doi" and ident.get('id') and ident['id'].startswith('10.'):
             doi = ident['id'].lower()
 
     for link in (bibjson['link'] or []):
```