aboutsummaryrefslogtreecommitdiffstats
path: root/python/grobid2json.py
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2021-10-26 12:54:37 -0700
committerBryan Newbold <bnewbold@archive.org>2021-10-26 12:54:37 -0700
commit05bd7cbcc62588e431c5efd533189e246b2a997e (patch)
treeabcc707a451e77ea1e8c5ac9a5925b97a4bd139a /python/grobid2json.py
parentf3f424e42f2f4f383103cf80b30a00cfa6cfc179 (diff)
downloadsandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.tar.gz
sandcrawler-05bd7cbcc62588e431c5efd533189e246b2a997e.zip
make fmt
Diffstat (limited to 'python/grobid2json.py')
-rwxr-xr-xpython/grobid2json.py12
1 files changed, 3 insertions, 9 deletions
diff --git a/python/grobid2json.py b/python/grobid2json.py
index b4bfe2b..0d47f36 100755
--- a/python/grobid2json.py
+++ b/python/grobid2json.py
@@ -1,5 +1,4 @@
#!/usr/bin/env python3
-
"""
NB: adapted to work as a library for PDF extraction. Will probably be
re-written eventually to be correct, complete, and robust; this is just a
@@ -76,9 +75,7 @@ def all_authors(elem: Optional[ET.Element]) -> List[Dict[str, Any]]:
def journal_info(elem: ET.Element) -> Dict[str, Any]:
journal = dict()
journal["name"] = elem.findtext(".//{%s}monogr/{%s}title" % (ns, ns))
- journal["publisher"] = elem.findtext(
- ".//{%s}publicationStmt/{%s}publisher" % (ns, ns)
- )
+ journal["publisher"] = elem.findtext(".//{%s}publicationStmt/{%s}publisher" % (ns, ns))
if journal["publisher"] == "":
journal["publisher"] = None
journal["issn"] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
@@ -145,9 +142,7 @@ def teixml2json(content: AnyStr, encumbered: bool = True) -> Dict[str, Any]:
info["grobid_version"] = application_tag.attrib["version"].strip()
info["grobid_timestamp"] = application_tag.attrib["when"].strip()
info["title"] = header.findtext(".//{%s}analytic/{%s}title" % (ns, ns))
- info["authors"] = all_authors(
- header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns))
- )
+ info["authors"] = all_authors(header.find(".//{%s}sourceDesc/{%s}biblStruct" % (ns, ns)))
info["journal"] = journal_info(header)
date = header.find('.//{%s}date[@type="published"]' % ns)
info["date"] = (date is not None) and date.attrib.get("when")
@@ -207,8 +202,7 @@ def main() -> None: # pragma no cover
json.dumps(
teixml2json(content, encumbered=(not args.no_encumbered)),
sort_keys=True,
- )
- )
+ ))
if __name__ == "__main__": # pragma no cover