about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-09-14 00:21:05 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-09-14 00:21:05 -0700
commit c4fcf41cf62d15ead45049e58670fd06eca819b7 (patch)
tree d8eb4bce71bab29be70185f221d05d2224dcaac5
parent d52cb39476aad977ffe8b73b16e831f78d3ab8fe (diff)
download fatcat-scholar-c4fcf41cf62d15ead45049e58670fd06eca819b7.tar.gz
fatcat-scholar-c4fcf41cf62d15ead45049e58670fd06eca819b7.zip
truncate arXiv: prefix from arxiv_id in GROBID parse
-rwxr-xr-x fatcat_scholar/grobid2json.py 2
1 files changed, 2 insertions, 0 deletions
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index 898275b..4c8543a 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -121,6 +121,8 @@ def biblio_info(elem: ET.Element) -> Dict[str, Any]:
ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
ref["doi"] = elem.findtext('.//{%s}idno[@type="DOI"]' % ns)
ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns)
+ if ref["arxiv_id"] and ref["arxiv_id"].startswith("arXiv:"):
+ ref["arxiv_id"] = ref["arxiv_id"][6:]
ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns)
ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns)
el = elem.find('.//{%s}biblScope[@unit="page"]' % ns)