summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-09-13 23:30:53 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-09-13 23:30:53 -0700
commit: 192ed94d2f3310913ebba62e24b313a8a4c8b2b2 (patch)
tree: 045c1a05dd067ce52eeb523be4e798642ec72d51
parent: 7a8518adae2997a507e21eae6d6a99b25b03c52d (diff)
download: fatcat-scholar-192ed94d2f3310913ebba62e24b313a8a4c8b2b2.tar.gz
          fatcat-scholar-192ed94d2f3310913ebba62e24b313a8a4c8b2b2.zip
ref transform: support more GROBID fields
-rw-r--r-- fatcat_scholar/schema.py 5
-rw-r--r-- fatcat_scholar/transform.py 26
2 files changed, 20 insertions, 11 deletions
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index d75dae8..ab58bf3 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -179,11 +179,13 @@ class ScholarDoc(BaseModel):
class RefBiblio(BaseModel):
+ unstructured: Optional[str]
title: Optional[str]
subtitle: Optional[str]
- contrib_raw_names: List[str]
+ contrib_raw_names: Optional[List[str]]
year: Optional[int]
container_name: Optional[str]
+ publisher: Optional[str]
volume: Optional[str]
issue: Optional[str]
pages: Optional[str]
@@ -199,6 +201,7 @@ class RefStructured(BaseModel):
biblio: RefBiblio
release_ident: Optional[str]
work_ident: Optional[str]
+ release_year: Optional[int]
index: Optional[int]
key: Optional[str]
locator: Optional[str]
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index a21abf9..b4b5c8d 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -4,7 +4,7 @@ import datetime
from typing import List, Dict, Optional, Any, Sequence
from dynaconf import settings
-from fatcat_openapi_client import ReleaseEntity, FileEntity, ReleaseRef
+from fatcat_openapi_client import ReleaseEntity, FileEntity
from fatcat_scholar.api_entities import *
from fatcat_scholar.schema import *
@@ -459,23 +459,26 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> Sequence[RefStru
output.append(
RefStructured(
biblio=RefBiblio(
+ unstructured=ref.get("unstructured"),
title=ref.get("title"),
# subtitle
- contrib_raw_names=authors,
+ contrib_raw_names=authors or None,
year=ref_year,
container_name=ref.get("journal"),
+ publisher=ref.get("publisher"),
volume=ref.get("volume"),
issue=ref.get("issue"),
- # pages: Optional[str]
- # doi: Optional[str]
- # pmid: Optional[str]
- # pmcid: Optional[str]
- # arxiv_id: Optional[str]
+ pages=ref.get("pages"),
+ doi=ref.get("doi"),
+ pmid=ref.get("pmid"),
+ pmcid=ref.get("pmcid"),
+ arxiv_id=ref.get("arxiv_id"),
# isbn13: Optional[str]
- url=ref.get("url"),
+ url=clean_url_conservative(ref.get("url")),
),
release_ident=release.ident,
work_ident=release.work_id,
+ release_year=release.year,
index=ref.get("index"),
key=ref.get("id"),
locator=None,
@@ -502,11 +505,13 @@ def refs_from_release_refs(release: ReleaseEntity) -> Sequence[RefStructured]:
output.append(
RefStructured(
biblio=RefBiblio(
+ unstructured=extra.get("unstructured"),
title=ref.title,
subtitle=extra.get("subtitle"),
- contrib_raw_names=authors,
+ contrib_raw_names=authors or None,
year=ref.year,
container_name=ref.container_name,
+ publisher=extra.get("publisher"),
volume=extra.get("volume"),
issue=extra.get("issue"),
pages=extra.get("pages"),
@@ -515,10 +520,11 @@ def refs_from_release_refs(release: ReleaseEntity) -> Sequence[RefStructured]:
pmcid=extra.get("pmcid"),
arxiv_id=extra.get("arxiv_id"),
isbn13=extra.get("isbn13"),
- url=extra.get("url"),
+ url=clean_url_conservative(extra.get("url")),
),
release_ident=release.ident,
work_ident=release.work_id,
+ release_year=release.year,
index=ref.index,
key=ref.key,
locator=ref.locator,