summary refs log tree commit diff stats
path: root/fatcat_scholar
diff options
context:
space:
mode:
Diffstat (limited to 'fatcat_scholar')
-rw-r--r-- fatcat_scholar/identifiers.py 16
-rw-r--r-- fatcat_scholar/transform.py 5
2 files changed, 9 insertions, 12 deletions
diff --git a/fatcat_scholar/identifiers.py b/fatcat_scholar/identifiers.py
index 583c8e6..7572e20 100644
--- a/fatcat_scholar/identifiers.py
+++ b/fatcat_scholar/identifiers.py
@@ -24,16 +24,10 @@ def clean_doi(raw: Optional[str]) -> Optional[str]:
return None
if len(raw.split()) != 1:
return None
- if raw.startswith("doi:"):
- raw = raw[4:]
- if raw.startswith("http://"):
- raw = raw[7:]
- if raw.startswith("https://"):
- raw = raw[8:]
- if raw.startswith("doi.org/"):
- raw = raw[8:]
- if raw.startswith("dx.doi.org/"):
- raw = raw[11:]
+ if not "10." in raw:
+ return None
+ if not raw.startswith("10."):
+ raw = raw[raw.find("10."):]
if raw[7:9] == "//":
raw = raw[:8] + raw[9:]
@@ -75,6 +69,8 @@ def test_clean_doi() -> None:
assert (
clean_doi("10.6002/ect.2020.häyry") == None
) # this example via pubmed (pmid:32519616)
+ # GROBID mangled DOI
+ assert clean_doi("21924DOI10.1234/asdf ") == "10.1234/asdf"
def clean_pmcid(raw: Optional[str]) -> Optional[str]:
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 388f2f5..f9616c4 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -12,6 +12,7 @@ from fatcat_scholar.api_entities import *
from fatcat_scholar.schema import *
from fatcat_scholar.config import settings, GIT_REVISION
from fatcat_scholar.grobid2json import teixml2json
+from fatcat_scholar.identifiers import clean_doi, clean_pmcid
MAX_BODY_CHARS = 512 * 1024
@@ -631,9 +632,9 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur
volume=ref.get("volume"),
issue=ref.get("issue"),
pages=ref.get("pages"),
- doi=ref.get("doi"),
+ doi=clean_doi(ref.get("doi")),
pmid=ref.get("pmid"),
- pmcid=ref.get("pmcid"),
+ pmcid=clean_pmcid(ref.get("pmcid")),
arxiv_id=ref.get("arxiv_id"),
# isbn13: Optional[str]
url=clean_url_conservative(ref.get("url")),