aboutsummaryrefslogtreecommitdiffstats
path: root/mapreduce
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2018-04-06 12:39:49 -0700
committerBryan Newbold <bnewbold@archive.org>2018-04-06 12:39:49 -0700
commit114c6b611148d2ff499bcea302eee0eca00df647 (patch)
treedf929050d3aa9484f78e5c1807bc951ce1e85512 /mapreduce
parente68d43e2369eed7ddf288be8c8f2edd0a85974e1 (diff)
downloadsandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.tar.gz
sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.zip
small grobid2json test
Diffstat (limited to 'mapreduce')
-rwxr-xr-xmapreduce/extraction_cdx_grobid.py1
-rwxr-xr-xmapreduce/grobid2json.py12
-rw-r--r--mapreduce/tests/files/small.json43
-rw-r--r--mapreduce/tests/files/small.xml110
4 files changed, 164 insertions, 2 deletions
diff --git a/mapreduce/extraction_cdx_grobid.py b/mapreduce/extraction_cdx_grobid.py
index a4a13f8..63f290a 100755
--- a/mapreduce/extraction_cdx_grobid.py
+++ b/mapreduce/extraction_cdx_grobid.py
@@ -61,6 +61,7 @@ class MRExtractCdxGrobid(MRJob):
r = requests.post(self.options.grobid_uri + "/api/processFulltextDocument",
files={'input': content})
if r.status_code is not 200:
+ # for an invalid file, GROBID responds with a 400 and a JSON body containing a 'description' key (among others)
# XXX:
return None
return r
diff --git a/mapreduce/grobid2json.py b/mapreduce/grobid2json.py
index cc6eb2c..52a3125 100755
--- a/mapreduce/grobid2json.py
+++ b/mapreduce/grobid2json.py
@@ -1,6 +1,10 @@
#!/usr/bin/env python3
"""
+NB: adapted to work as a library for PDF extraction. Will probably be
+re-written eventually to be correct, complete, and robust; this is just a
+first iteration.
+
This script tries to extract everything from a GROBID TEI XML fulltext dump:
- header metadata
@@ -38,6 +42,8 @@ def journal_info(elem):
journal = dict()
journal['name'] = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
journal['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
+ if journal['publisher'] == '':
+ journal['publisher'] = None
journal['issn'] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
journal['eissn'] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
journal['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
@@ -59,6 +65,8 @@ def biblio_info(elem):
ref['title'] = other_title
ref['authors'] = all_authors(elem)
ref['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
+ if ref['publisher'] == '':
+ ref['publisher'] = None
date = elem.find('.//{%s}date[@type="published"]' % ns)
ref['date'] = (date != None) and date.attrib.get('when')
ref['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
@@ -74,7 +82,7 @@ def biblio_info(elem):
return ref
-def do_tei(content, encumbered=True):
+def teixml2json(content, encumbered=True):
if type(content) == str:
content = io.StringIO(content)
@@ -131,7 +139,7 @@ def main(): # pragma no cover
for filename in args.teifiles:
content = open(filename, 'r')
print(json.dumps(
- do_tei(content,
+ teixml2json(content,
encumbered=(not args.no_encumbered))))
if __name__=='__main__': # pragma no cover
diff --git a/mapreduce/tests/files/small.json b/mapreduce/tests/files/small.json
new file mode 100644
index 0000000..208fb49
--- /dev/null
+++ b/mapreduce/tests/files/small.json
@@ -0,0 +1,43 @@
+{
+ "title": "Dummy Example File",
+ "authors": [
+ {"name": "Brewster Kahle"},
+ {"name": "J Doe"}
+ ],
+ "journal": {
+ "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+ "eissn": null,
+ "issn": null,
+ "issue": null,
+ "publisher": null,
+ "volume": null
+ },
+ "date": "2000",
+ "doi": null,
+ "citations": [
+ { "authors": [{"name": "A Seaperson"}],
+ "date": "2001",
+ "id": "b0",
+ "index": 0,
+ "issue": null,
+ "journal": "Letters in the Alphabet",
+ "publisher": null,
+ "title": "Everything is Wonderful",
+ "url": null,
+ "volume": "20"},
+ { "authors": [],
+ "date": "2011-03-28",
+ "id": "b1",
+ "index": 1,
+ "issue": null,
+ "journal": "The Dictionary",
+ "publisher": null,
+ "title": "All about Facts",
+ "url": null,
+ "volume": "14"}
+ ],
+ "abstract": "Everything you ever wanted to know about nothing",
+ "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+ "acknowledgement": null,
+ "annex": null
+}
diff --git a/mapreduce/tests/files/small.xml b/mapreduce/tests/files/small.xml
new file mode 100644
index 0000000..78b9ba2
--- /dev/null
+++ b/mapreduce/tests/files/small.xml
@@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0"
+xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /srv/grobid/grobid-0.5.1/grobid-home/schemas/xsd/Grobid.xsd"
+ xmlns:xlink="http://www.w3.org/1999/xlink">
+ <teiHeader xml:lang="en">
+ <encodingDesc>
+ <appInfo>
+ <application version="0.5.1-SNAPSHOT" ident="GROBID" when="2018-04-02T00:31+0000">
+ <ref target="https://github.com/kermitt2/grobid">GROBID - A machine learning software for extracting information from scholarly documents</ref>
+ </application>
+ </appInfo>
+ </encodingDesc>
+ <fileDesc>
+ <titleStmt>
+ <title level="a" type="main">Dummy Example File</title>
+ </titleStmt>
+ <publicationStmt>
+ <publisher/>
+ <availability status="unknown"><licence/></availability>
+ <date type="published" when="2000">2000</date>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <analytic>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Brewster</forename><surname>Kahle</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Doe</surname></persName>
+ </author>
+ <author>
+ <affiliation key="aff0">
+ <orgName type="institution">Internet Archive</orgName>
+ </affiliation>
+ </author>
+ <title level="a" type="main">Dummy Example File</title>
+ </analytic>
+ <monogr>
+ <title level="m">Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678</title>
+ <imprint>
+ <date type="published" when="2000">2000</date>
+ </imprint>
+ </monogr>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+ <profileDesc>
+ <textClass>
+ <keywords>
+ <term>Fake Data</term>
+ </keywords>
+ </textClass>
+ <abstract>
+ <p>Everything you ever wanted to know about nothing</p>
+ </abstract>
+ </profileDesc>
+ </teiHeader>
+ <text xml:lang="en">
+ <body>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">Introduction</head><p>
+Everything starts somewhere, as somebody<ref type="bibr" target="#b0">[1]</ref> once said.</p></div>
+
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2">In Depth</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1">Meat</head><p>
+You know, for kids.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2">Potatos</head><p>
+QED.</p></div>
+ </body>
+ <back>
+ <div type="references">
+
+ <listBibl>
+
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">Everything is Wonderful</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="middle">A</forename><surname>Seaperson</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Letters in the Alphabet</title>
+ <imprint>
+ <biblScope unit="volume">20</biblScope>
+ <biblScope unit="page" from="1" to="11" />
+ <date type="published" when="2001" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">All about Facts</title>
+ </analytic>
+ <monogr>
+ <title level="j">The Dictionary</title>
+ <imprint>
+ <biblScope unit="volume">14</biblScope>
+ <date type="published" when="2011-03-28" />
+ </imprint>
+ </monogr>
+ <note>None</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>