From 4249da27c244406291133453bf209057e29aacef Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 18 Jan 2021 23:47:29 -0800 Subject: grobid2json: add namespace override option for some parsing This is to make the reference parsing code re-usable with the simpler processCitations response, which is an XML fragment with no namespace. Should have no impact on existing code paths. --- fatcat_scholar/grobid2json.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fatcat_scholar/grobid2json.py b/fatcat_scholar/grobid2json.py index 4c8543a..e94bed2 100755 --- a/fatcat_scholar/grobid2json.py +++ b/fatcat_scholar/grobid2json.py @@ -35,7 +35,7 @@ xml_ns = "http://www.w3.org/XML/1998/namespace" ns = "http://www.tei-c.org/ns/1.0" -def all_authors(elem: Optional[ET.Element]) -> List[Dict[str, Any]]: +def all_authors(elem: Optional[ET.Element], ns: str = ns) -> List[Dict[str, Any]]: if not elem: return [] names = [] @@ -96,7 +96,7 @@ def journal_info(elem: ET.Element) -> Dict[str, Any]: return journal -def biblio_info(elem: ET.Element) -> Dict[str, Any]: +def biblio_info(elem: ET.Element, ns: str = ns) -> Dict[str, Any]: ref: Dict[str, Any] = dict() ref["id"] = elem.attrib.get("{http://www.w3.org/XML/1998/namespace}id") ref["unstructured"] = elem.findtext('.//{%s}note[@type="raw_reference"]' % ns) @@ -109,7 +109,7 @@ def biblio_info(elem: ET.Element) -> Dict[str, Any]: else: ref["journal"] = None ref["title"] = other_title - ref["authors"] = all_authors(elem) + ref["authors"] = all_authors(elem, ns=ns) ref["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns)) if not ref["publisher"]: ref["publisher"] = elem.findtext(".//{%s}imprint/{%s}publisher" % (ns, ns)) -- cgit v1.2.3