From c4fcf41cf62d15ead45049e58670fd06eca819b7 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 14 Sep 2020 00:21:05 -0700 Subject: truncate arXiv: prefix from arxiv_id in GROBID parse --- fatcat_scholar/grobid2json.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py index 898275b..4c8543a 100755 --- a/fatcat_scholar/grobid2json.py +++ b/fatcat_scholar/grobid2json.py @@ -121,6 +121,8 @@ def biblio_info(elem: ET.Element) -> Dict[str, Any]: ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns) ref["doi"] = elem.findtext('.//{%s}idno[@type="DOI"]' % ns) ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns) + if ref["arxiv_id"] and ref["arxiv_id"].startswith("arXiv:"): + ref["arxiv_id"] = ref["arxiv_id"][6:] ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns) ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns) el = elem.find('.//{%s}biblScope[@unit="page"]' % ns) -- cgit v1.2.3