about summary refs log tree commit diff stats
diff options
context:
space:
mode:
author Bryan Newbold <bnewbold@archive.org> 2020-09-13 23:32:17 -0700
committer Bryan Newbold <bnewbold@archive.org> 2020-09-13 23:32:17 -0700
commit 2e659b6bad1ab429d36fcd8cb1a686eab81e6d89 (patch)
tree 920c49c1dad5eed51329714e686b45cf754790c7
parent 192ed94d2f3310913ebba62e24b313a8a4c8b2b2 (diff)
download fatcat-scholar-2e659b6bad1ab429d36fcd8cb1a686eab81e6d89.tar.gz
fatcat-scholar-2e659b6bad1ab429d36fcd8cb1a686eab81e6d89.zip
grobid2json: extract more reference biblio fields
-rwxr-xr-x fatcat_scholar/grobid2json.py 23
1 files changed, 17 insertions, 6 deletions
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index fc19036..dcf9ce8 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -97,14 +97,9 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]:
def biblio_info(elem: ET.Element) -> Dict[str, Any]:
- """
- TODO for references:
- - pages
- - locator
- - doi, pmid, pmcid, arxiv_id, isbn
- """
ref: Dict[str, Any] = dict()
ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
+ ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns)
# Title stuff is messy in references...
ref["title"] = elem.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
other_title = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
@@ -116,18 +111,34 @@ def biblio_info(elem: ET.Element) -> Dict[str, Any]:
ref["title"] = other_title
ref["authors"] = all_authors(elem)
ref["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns))
+ if not ref["publisher"]:
+ ref["publisher"] = elem.findtext(".//{%s}imprint/{%s}publisher" % (ns, ns))
if ref["publisher"] == "":
ref["publisher"] = None
date = elem.find('.//{%s}date[@type="published"]' % ns)
ref["date"] = (date is not None) and date.attrib.get("when")
ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+ ref["doi"] = elem.findtext('.//{%s}idno[@type="DOI"]' % ns)
+ ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns)
+ ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns)
+ ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns)
+ el = elem.find(".//{%s}biblScope[@page]" % ns)
+ if el is not None:
+ if el.attrib.get("from") and el.attrib.get("to"):
+ ref["pages"] = "{}-{}".format(el.attrib["from"], el.attrib["to"])
+ else:
+ ref["pages"] = el.text
el = elem.find(".//{%s}ptr[@target]" % ns)
if el is not None:
ref["url"] = el.attrib["target"]
# Hand correction
if ref["url"].endswith(".Lastaccessed"):
ref["url"] = ref["url"].replace(".Lastaccessed", "")
+ if ref["url"].startswith("<"):
+ ref["url"] = ref["url"][1:]
+ if ">" in ref["url"]:
+ ref["url"] = ref["url"].split(">")[0]
else:
ref["url"] = None
return ref