diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-09-13 23:32:17 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-09-13 23:32:17 -0700 |
commit | 2e659b6bad1ab429d36fcd8cb1a686eab81e6d89 (patch) | |
tree | 920c49c1dad5eed51329714e686b45cf754790c7 | |
parent | 192ed94d2f3310913ebba62e24b313a8a4c8b2b2 (diff) | |
download | fatcat-scholar-2e659b6bad1ab429d36fcd8cb1a686eab81e6d89.tar.gz fatcat-scholar-2e659b6bad1ab429d36fcd8cb1a686eab81e6d89.zip |
grobid2json: extract more reference biblio fields
-rwxr-xr-x | fatcat_scholar/grobid2json.py | 23 |
1 files changed, 17 insertions, 6 deletions
```diff
diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py
index fc19036..dcf9ce8 100755
--- a/fatcat_scholar/grobid2json.py
+++ b/fatcat_scholar/grobid2json.py
@@ -97,14 +97,9 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]:
 
 
 def biblio_info(elem: ET.Element) -> Dict[str, Any]:
-    """
-    TODO for references:
-    - pages
-    - locator
-    - doi, pmid, pmcid, arxiv_id, isbn
-    """
     ref: Dict[str, Any] = dict()
     ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id")
+    ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns)
     # Title stuff is messy in references...
     ref["title"] = elem.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
     other_title = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
@@ -116,18 +111,34 @@ def biblio_info(elem: ET.Element) -> Dict[str, Any]:
             ref["title"] = other_title
     ref["authors"] = all_authors(elem)
     ref["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns))
+    if not ref["publisher"]:
+        ref["publisher"] = elem.findtext(".//{%s}imprint/{%s}publisher" % (ns, ns))
     if ref["publisher"] == "":
         ref["publisher"] = None
     date = elem.find('.//{%s}date[@type="published"]' % ns)
     ref["date"] = (date is not None) and date.attrib.get("when")
     ref["volume"] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
     ref["issue"] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+    ref["doi"] = elem.findtext('.//{%s}idno[@type="DOI"]' % ns)
+    ref["arxiv_id"] = elem.findtext('.//{%s}idno[@type="arXiv"]' % ns)
+    ref["pmcid"] = elem.findtext('.//{%s}idno[@type="PMCID"]' % ns)
+    ref["pmid"] = elem.findtext('.//{%s}idno[@type="PMID"]' % ns)
+    el = elem.find(".//{%s}biblScope[@page]" % ns)
+    if el is not None:
+        if el.attrib.get("from") and el.attrib.get("to"):
+            ref["pages"] = "{}-{}".format(el.attrib["from"], el.attrib["to"])
+        else:
+            ref["pages"] = el.text
     el = elem.find(".//{%s}ptr[@target]" % ns)
     if el is not None:
         ref["url"] = el.attrib["target"]
         # Hand correction
         if ref["url"].endswith(".Lastaccessed"):
             ref["url"] = ref["url"].replace(".Lastaccessed", "")
+        if ref["url"].startswith("<"):
+            ref["url"] = ref["url"][1:]
+        if ">" in ref["url"]:
+            ref["url"] = ref["url"].split(">")[0]
     else:
         ref["url"] = None
     return ref
```