diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-09-14 00:16:22 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-09-14 00:16:22 -0700 |
commit | d52cb39476aad977ffe8b73b16e831f78d3ab8fe (patch) | |
tree | 36902f6a0a4ff8b9d20ff4342320178d7c192396 /fatcat_scholar | |
parent | 71ce30e7547871cb6fe02fa4237af735bd6b9c49 (diff) | |
download | fatcat-scholar-d52cb39476aad977ffe8b73b16e831f78d3ab8fe.tar.gz fatcat-scholar-d52cb39476aad977ffe8b73b16e831f78d3ab8fe.zip |
refs and grobid2json bugfixes from testing
Diffstat (limited to 'fatcat_scholar')
-rwxr-xr-x | fatcat_scholar/grobid2json.py | 2 | ||||
-rw-r--r-- | fatcat_scholar/transform.py | 13 |
2 files changed, 11 insertions, 4 deletions
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index dcf9ce8..898275b 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -123,7 +123,7 @@ def biblio_info(elem: ET.Element) -> Dict[str, Any]:
     ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns)
     ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns)
     ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns)
-    el = elem.find(".//{%s}biblScope[@page]" % ns)
+    el = elem.find('.//{%s}biblScope[@unit="page"]' % ns)
     if el is not None:
         if el.attrib.get("from") and el.attrib.get("to"):
             ref["pages"] = "{}-{}".format(el.attrib["from"], el.attrib["to"])
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index 7ff30fe..af794e6 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -452,10 +452,17 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> Sequence[RefStru
     for ref in tei_dict.get("citations") or []:
         ref_date = ref.get("date") or None
         ref_year: Optional[int] = None
-        if ref_date and len(ref_date) > 4 and ref_date[:4].isdigit():
+        if ref_date and len(ref_date) >= 4 and ref_date[:4].isdigit():
             ref_year = int(ref_date[:4])
-        authors = ref.get("authors") or []
-        authors = [a for a in authors if type(a) == str]
+        ref_authors = ref.get("authors") or []
+        authors: List[str] = []
+        for a in ref_authors:
+            if isinstance(a, str):
+                authors.append(a)
+            elif isinstance(a, dict):
+                if a.get("name"):
+                    assert isinstance(a["name"], str)
+                    authors.append(a["name"])
         output.append(
             RefStructured(
                 biblio=RefBiblio(