diff options
author | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2021-10-26 12:54:37 -0700 |
commit | 05bd7cbcc62588e431c5efd533189e246b2a997e (patch) | |
tree | abcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/grobid2json.py | |
parent | f3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff) | |
download | sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip |
make fmt
Diffstat (limited to 'python/grobid2json.py')
-rwxr-xr-x | python/grobid2json.py | 12 |
1 files changed, 3 insertions, 9 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py
index b4bfe2b..0d47f36 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -1,5 +1,4 @@
 #!/usr/bin/env python3
-
 """
 NB: adapted to work as a library for PDF extraction. Will probably be
 re-written eventually to be correct, complete, and robust; this is just a
@@ -76,9 +75,7 @@ def all_authors(elem: Optional[ET.Element]) -> List[Dict[str, Any]]:
 def journal_info(elem: ET.Element) -> Dict[str, Any]:
     journal = dict()
     journal["name"] = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
-    journal["publisher"] = elem.findtext(
-        ".//{%s}publicationStmt/{%s}publisher" % (ns, ns)
-    )
+    journal["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns))
     if journal["publisher"] == "":
         journal["publisher"] = None
     journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
@@ -145,9 +142,7 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
     info["grobid_version"] = application_tag.attrib["version"].strip()
     info["grobid_timestamp"] = application_tag.attrib["when"].strip()
     info["title"] = header.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
-    info["authors"] = all_authors(
-        header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns))
-    )
+    info["authors"] = all_authors(header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns)))
     info["journal"] = journal_info(header)
     date = header.find('.//{%s}date[@type="published"]' % ns)
     info["date"] = (date is not None) and date.attrib.get("when")
@@ -207,8 +202,7 @@ def main() -> None:  # pragma no cover
         json.dumps(
             teixml2json(content, encumbered=(not args.no_encumbered)),
             sort_keys=True,
-        )
-    )
+        ))
 
 
 if __name__ == "__main__":  # pragma no cover