small grobid2json test

author: Bryan Newbold <bnewbold@archive.org> 2018-04-06 12:39:49 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-04-06 12:39:49 -0700
commit: 114c6b611148d2ff499bcea302eee0eca00df647 (patch)
tree: df929050d3aa9484f78e5c1807bc951ce1e85512
parent: e68d43e2369eed7ddf288be8c8f2edd0a85974e1 (diff)
download: sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.tar.gz sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.zip
4 files changed, 164 insertions, 2 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index a4a13f8..63f290a 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -61,6 +61,7 @@ class MRExtractCdxGrobid(MRJob):
         r = requests.post(self.options.grobid_uri + "/api/processFulltextDocument",
             files={'input': content})
         if r.status_code is not 200:
+            # if invalid file, get a 400 with JSON body with 'description' key (and others)
             # XXX:
             return None
         return r
diff --git a/mapreduce/grobid2json.py b/mapreduce/grobid2json.py
index cc6eb2c..52a3125 100755
--- a/mapreduce/grobid2json.py
+++ b/mapreduce/grobid2json.py
@@ -1,6 +1,10 @@
 #!/usr/bin/env python3
 
 """
+NB: adapted to work as a library for PDF extraction. Will probably be
+re-written eventually to be correct, complete, and robust; this is just a
+first iteration.
+
 This script tries to extract everything from a GROBID TEI XML fulltext dump:
 
 - header metadata
@@ -38,6 +42,8 @@ def journal_info(elem):
     journal = dict()
     journal['name'] = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
     journal['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
+    if journal['publisher'] == '':
+        journal['publisher'] = None
     journal['issn'] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
     journal['eissn'] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
     journal['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
@@ -59,6 +65,8 @@ def biblio_info(elem):
             ref['title'] = other_title
     ref['authors'] = all_authors(elem)
     ref['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
+    if ref['publisher'] == '':
+        ref['publisher'] = None
     date = elem.find('.//{%s}date[@type="published"]' % ns)
     ref['date'] = (date != None) and date.attrib.get('when')
     ref['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
@@ -74,7 +82,7 @@ def biblio_info(elem):
     return ref
 
 
-def do_tei(content, encumbered=True):
+def teixml2json(content, encumbered=True):
 
     if type(content) == str:
         content = io.StringIO(content)
@@ -131,7 +139,7 @@ def main():   # pragma no cover
     for filename in args.teifiles:
         content = open(filename, 'r')
         print(json.dumps(
-            do_tei(content,
+            teixml2json(content,
                encumbered=(not args.no_encumbered))))
 
 if __name__=='__main__':   # pragma no cover
diff --git a/mapreduce/tests/files/small.json b/mapreduce/tests/files/small.json
new file mode 100644
index 0000000..208fb49
--- /dev/null
+++ b/mapreduce/tests/files/small.json
@@ -0,0 +1,43 @@
+{
+  "title": "Dummy Example File",
+  "authors": [
+    {"name": "Brewster Kahle"},
+    {"name": "J Doe"}
+  ],
+  "journal": {
+    "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+    "eissn": null,
+    "issn": null,
+    "issue": null,
+    "publisher": null,
+    "volume": null
+  },
+  "date": "2000",
+  "doi": null,
+  "citations": [
+    { "authors": [{"name": "A Seaperson"}],
+      "date": "2001",
+      "id": "b0",
+      "index": 0,
+      "issue": null,
+      "journal": "Letters in the Alphabet",
+      "publisher": null,
+      "title": "Everything is Wonderful",
+      "url": null,
+      "volume": "20"},
+    { "authors": [],
+      "date": "2011-03-28",
+      "id": "b1",
+      "index": 1,
+      "issue": null,
+      "journal": "The Dictionary",
+      "publisher": null,
+      "title": "All about Facts",
+      "url": null,
+      "volume": "14"}
+  ],
+  "abstract": "Everything you ever wanted to know about nothing",
+  "body": "Introduction \nEverything starts somewhere, as somebody [1]  once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+  "acknowledgement": null,
+  "annex": null
+}
diff --git a/mapreduce/tests/files/small.xml b/mapreduce/tests/files/small.xml
new file mode 100644
index 0000000..78b9ba2
--- /dev/null
+++ b/mapreduce/tests/files/small.xml
@@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0" 
+xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" 
+xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /srv/grobid/grobid-0.5.1/grobid-home/schemas/xsd/Grobid.xsd"
+ xmlns:xlink="http://www.w3.org/1999/xlink">
+	<teiHeader xml:lang="en">
+		<encodingDesc>
+			<appInfo>
+				<application version="0.5.1-SNAPSHOT" ident="GROBID" when="2018-04-02T00:31+0000">
+					<ref target="https://github.com/kermitt2/grobid">GROBID - A machine learning software for extracting information from scholarly documents</ref>
+				</application>
+			</appInfo>
+		</encodingDesc>
+		<fileDesc>
+			<titleStmt>
+				<title level="a" type="main">Dummy Example File</title>
+			</titleStmt>
+			<publicationStmt>
+				<publisher/>
+				<availability status="unknown"><licence/></availability>
+				<date type="published" when="2000">2000</date>
+			</publicationStmt>
+			<sourceDesc>
+				<biblStruct>
+					<analytic>
+						<author>
+							<persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Brewster</forename><surname>Kahle</surname></persName>
+						</author>
+						<author>
+							<persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Doe</surname></persName>
+						</author>
+						<author>
+							<affiliation key="aff0">
+								<orgName type="institution">Internet Archive</orgName>
+							</affiliation>
+						</author>
+						<title level="a" type="main">Dummy Example File</title>
+					</analytic>
+					<monogr>
+						<title level="m">Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678</title>
+						<imprint>
+							<date type="published" when="2000">2000</date>
+						</imprint>
+					</monogr>
+				</biblStruct>
+			</sourceDesc>
+		</fileDesc>
+		<profileDesc>
+			<textClass>
+				<keywords>
+					<term>Fake Data</term>
+				</keywords>
+			</textClass>
+			<abstract>
+				<p>Everything you ever wanted to know about nothing</p>
+			</abstract>
+		</profileDesc>
+	</teiHeader>
+	<text xml:lang="en">
+		<body>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">Introduction</head><p>
+Everything starts somewhere, as somebody<ref type="bibr" target="#b0">[1]</ref> once said.</p></div>
+
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2">In Depth</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1">Meat</head><p>
+You know, for kids.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2">Potatos</head><p>
+QED.</p></div>
+		</body>
+		<back>
+			<div type="references">
+
+				<listBibl>
+
+<biblStruct xml:id="b0">
+	<analytic>
+		<title level="a" type="main">Everything is Wonderful</title>
+		<author>
+			<persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="middle">A</forename><surname>Seaperson</surname></persName>
+		</author>
+	</analytic>
+	<monogr>
+		<title level="j">Letters in the Alphabet</title>
+		<imprint>
+			<biblScope unit="volume">20</biblScope>
+			<biblScope unit="page" from="1" to="11" />
+			<date type="published" when="2001" />
+		</imprint>
+	</monogr>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+	<analytic>
+		<title level="a" type="main">All about Facts</title>
+	</analytic>
+	<monogr>
+		<title level="j">The Dictionary</title>
+		<imprint>
+			<biblScope unit="volume">14</biblScope>
+			<date type="published" when="2011-03-28" />
+		</imprint>
+	</monogr>
+	<note>None</note>
+</biblStruct>
+
+				</listBibl>
+			</div>
+		</back>
+	</text>
+</TEI>
author	Bryan Newbold <bnewbold@archive.org>	2018-04-06 12:39:49 -0700
committer	Bryan Newbold <bnewbold@archive.org>	2018-04-06 12:39:49 -0700
commit	114c6b611148d2ff499bcea302eee0eca00df647 (patch)
tree	df929050d3aa9484f78e5c1807bc951ce1e85512
parent	e68d43e2369eed7ddf288be8c8f2edd0a85974e1 (diff)
download	sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.tar.gz sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.zip