aboutsummaryrefslogtreecommitdiffstats
path: root/fatcat_scholar/transform.py
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2021-07-01 16:36:27 -0700
committer Bryan Newbold <bnewbold@archive.org> 2021-07-01 16:36:27 -0700
commit 3b4d87826eea9b827fdd8b824e6cd738d89db5bd (patch)
tree 985f29c7c21bb732209ca66fa58123125248940f /fatcat_scholar/transform.py
parent 5a1d53e8705d5ea59ea0c007f8a53940a353000b (diff)
download fatcat-scholar-3b4d87826eea9b827fdd8b824e6cd738d89db5bd.tar.gz
fatcat-scholar-3b4d87826eea9b827fdd8b824e6cd738d89db5bd.zip
refs: clean up GROBID DOIs and PMCIDs
Diffstat (limited to 'fatcat_scholar/transform.py')
-rw-r--r-- fatcat_scholar/transform.py 5
1 files changed, 3 insertions, 2 deletions
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 388f2f5..f9616c4 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -12,6 +12,7 @@ from fatcat_scholar.api_entities import *
from fatcat_scholar.schema import *
from fatcat_scholar.config import settings, GIT_REVISION
from fatcat_scholar.grobid2json import teixml2json
+from fatcat_scholar.identifiers import clean_doi, clean_pmcid
MAX_BODY_CHARS = 512 * 1024
@@ -631,9 +632,9 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur
volume=ref.get("volume"),
issue=ref.get("issue"),
pages=ref.get("pages"),
- doi=ref.get("doi"),
+ doi=clean_doi(ref.get("doi")),
pmid=ref.get("pmid"),
- pmcid=ref.get("pmcid"),
+ pmcid=clean_pmcid(ref.get("pmcid")),
arxiv_id=ref.get("arxiv_id"),
# isbn13: Optional[str]
url=clean_url_conservative(ref.get("url")),