diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-09-14 00:21:05 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-09-14 00:21:05 -0700 |
commit | c4fcf41cf62d15ead45049e58670fd06eca819b7 (patch) | |
tree | d8eb4bce71bab29be70185f221d05d2224dcaac5 | |
parent | d52cb39476aad977ffe8b73b16e831f78d3ab8fe (diff) | |
download | fatcat-scholar-c4fcf41cf62d15ead45049e58670fd06eca819b7.tar.gz fatcat-scholar-c4fcf41cf62d15ead45049e58670fd06eca819b7.zip |
truncate arXiv: prefix from arxiv_id in GROBID parse
-rwxr-xr-x | fatcat_scholar/grobid2json.py | 2 |
1 file changed, 2 insertions, 0 deletions
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index 898275b..4c8543a 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -121,6 +121,8 @@ def biblio_info(elem: ET.Element) -> Dict[str, Any]:
     ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
     ref["doi"] = elem.findtext('.//{%s}idno[@type="DOI"]' % ns)
     ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns)
+    if ref["arxiv_id"] and ref["arxiv_id"].startswith("arXiv:"):
+        ref["arxiv_id"] = ref["arxiv_id"][6:]
     ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns)
     ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns)
     el = elem.find('.//{%s}biblScope[@unit="page"]' % ns)