about | summary | refs | log | tree | commit | diff | stats
path: root/mapreduce/tests
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2018-04-06 12:39:49 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2018-04-06 12:39:49 -0700
commit: 114c6b611148d2ff499bcea302eee0eca00df647 (patch)
tree: df929050d3aa9484f78e5c1807bc951ce1e85512 /mapreduce/tests
parent: e68d43e2369eed7ddf288be8c8f2edd0a85974e1 (diff)
download: sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.tar.gz
sandcrawler-114c6b611148d2ff499bcea302eee0eca00df647.zip
small grobid2json test
Diffstat (limited to 'mapreduce/tests')
-rw-r--r-- mapreduce/tests/files/small.json | 43
-rw-r--r-- mapreduce/tests/files/small.xml | 110
2 files changed, 153 insertions, 0 deletions
diff --git a/mapreduce/tests/files/small.json b/mapreduce/tests/files/small.json
new file mode 100644
index 0000000..208fb49
--- /dev/null
+++ b/mapreduce/tests/files/small.json
@@ -0,0 +1,43 @@
+{
+ "title": "Dummy Example File",
+ "authors": [
+ {"name": "Brewster Kahle"},
+ {"name": "J Doe"}
+ ],
+ "journal": {
+ "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+ "eissn": null,
+ "issn": null,
+ "issue": null,
+ "publisher": null,
+ "volume": null
+ },
+ "date": "2000",
+ "doi": null,
+ "citations": [
+ { "authors": [{"name": "A Seaperson"}],
+ "date": "2001",
+ "id": "b0",
+ "index": 0,
+ "issue": null,
+ "journal": "Letters in the Alphabet",
+ "publisher": null,
+ "title": "Everything is Wonderful",
+ "url": null,
+ "volume": "20"},
+ { "authors": [],
+ "date": "2011-03-28",
+ "id": "b1",
+ "index": 1,
+ "issue": null,
+ "journal": "The Dictionary",
+ "publisher": null,
+ "title": "All about Facts",
+ "url": null,
+ "volume": "14"}
+ ],
+ "abstract": "Everything you ever wanted to know about nothing",
+ "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+ "acknowledgement": null,
+ "annex": null
+}
diff --git a/mapreduce/tests/files/small.xml b/mapreduce/tests/files/small.xml
new file mode 100644
index 0000000..78b9ba2
--- /dev/null
+++ b/mapreduce/tests/files/small.xml
@@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0"
+xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /srv/grobid/grobid-0.5.1/grobid-home/schemas/xsd/Grobid.xsd"
+ xmlns:xlink="http://www.w3.org/1999/xlink">
+ <teiHeader xml:lang="en">
+ <encodingDesc>
+ <appInfo>
+ <application version="0.5.1-SNAPSHOT" ident="GROBID" when="2018-04-02T00:31+0000">
+ <ref target="https://github.com/kermitt2/grobid">GROBID - A machine learning software for extracting information from scholarly documents</ref>
+ </application>
+ </appInfo>
+ </encodingDesc>
+ <fileDesc>
+ <titleStmt>
+ <title level="a" type="main">Dummy Example File</title>
+ </titleStmt>
+ <publicationStmt>
+ <publisher/>
+ <availability status="unknown"><licence/></availability>
+ <date type="published" when="2000">2000</date>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <analytic>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Brewster</forename><surname>Kahle</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Doe</surname></persName>
+ </author>
+ <author>
+ <affiliation key="aff0">
+ <orgName type="institution">Internet Archive</orgName>
+ </affiliation>
+ </author>
+ <title level="a" type="main">Dummy Example File</title>
+ </analytic>
+ <monogr>
+ <title level="m">Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678</title>
+ <imprint>
+ <date type="published" when="2000">2000</date>
+ </imprint>
+ </monogr>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+ <profileDesc>
+ <textClass>
+ <keywords>
+ <term>Fake Data</term>
+ </keywords>
+ </textClass>
+ <abstract>
+ <p>Everything you ever wanted to know about nothing</p>
+ </abstract>
+ </profileDesc>
+ </teiHeader>
+ <text xml:lang="en">
+ <body>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">Introduction</head><p>
+Everything starts somewhere, as somebody<ref type="bibr" target="#b0">[1]</ref> once said.</p></div>
+
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2">In Depth</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1">Meat</head><p>
+You know, for kids.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2">Potatos</head><p>
+QED.</p></div>
+ </body>
+ <back>
+ <div type="references">
+
+ <listBibl>
+
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">Everything is Wonderful</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="middle">A</forename><surname>Seaperson</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Letters in the Alphabet</title>
+ <imprint>
+ <biblScope unit="volume">20</biblScope>
+ <biblScope unit="page" from="1" to="11" />
+ <date type="published" when="2001" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">All about Facts</title>
+ </analytic>
+ <monogr>
+ <title level="j">The Dictionary</title>
+ <imprint>
+ <biblScope unit="volume">14</biblScope>
+ <date type="published" when="2011-03-28" />
+ </imprint>
+ </monogr>
+ <note>None</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>