aboutsummaryrefslogtreecommitdiffstats
path: root/python_hadoop
diff options
context:
space:
mode:
Diffstat (limited to 'python_hadoop')
-rw-r--r--python_hadoop/Pipfile33
-rw-r--r--python_hadoop/Pipfile.lock990
-rw-r--r--python_hadoop/README.md104
-rwxr-xr-xpython_hadoop/backfill_hbase_from_cdx.py88
-rw-r--r--python_hadoop/common.py99
-rwxr-xr-xpython_hadoop/extraction_cdx_grobid.py299
-rwxr-xr-xpython_hadoop/extraction_ungrobided.py292
-rwxr-xr-xpython_hadoop/grobid2json.py154
-rwxr-xr-xpython_hadoop/kafka_grobid_hbase.py200
-rw-r--r--python_hadoop/mrjob.conf16
-rw-r--r--python_hadoop/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml2004
-rw-r--r--python_hadoop/tests/files/example.cdx20
-rw-r--r--python_hadoop/tests/files/example_grobid_metadata.json5
-rw-r--r--python_hadoop/tests/files/example_ungrobided.tsv20
-rw-r--r--python_hadoop/tests/files/small.json46
-rw-r--r--python_hadoop/tests/files/small.xml110
-rw-r--r--python_hadoop/tests/test_backfill_hbase_from_cdx.py74
-rw-r--r--python_hadoop/tests/test_common.py40
-rw-r--r--python_hadoop/tests/test_extraction_cdx_grobid.py319
-rw-r--r--python_hadoop/tests/test_extraction_ungrobided.py178
-rw-r--r--python_hadoop/tests/test_grobid2json.py22
21 files changed, 5113 insertions, 0 deletions
diff --git a/python_hadoop/Pipfile b/python_hadoop/Pipfile
new file mode 100644
index 0000000..42fb095
--- /dev/null
+++ b/python_hadoop/Pipfile
@@ -0,0 +1,33 @@
+[[source]]
+name = "ia"
+url = "https://devpi.archive.org/wb/prod"
+verify_ssl = true
+
+[[source]]
+name = "pypi"
+url = "https://pypi.python.org/simple"
+verify_ssl = true
+
+[dev-packages]
+ipython = "*"
+happybase-mock = "*"
+pytest = "*"
+pytest-pythonpath = "*"
+responses = "*"
+pytest-cov = "*"
+pylint = "*"
+
+[packages]
+globalwayback = {version=">=0.3", index="ia"}
+happybase = "*"
+mrjob = "*"
+requests = "*"
+wayback = {version=">=0.2.1.2", index="ia"}
+xmltodict = "*"
+raven = "*"
+pykafka = "*"
+python-snappy = "*"
+boto3 = "*"
+
+[requires]
+python_version = "3.5"
diff --git a/python_hadoop/Pipfile.lock b/python_hadoop/Pipfile.lock
new file mode 100644
index 0000000..1d53667
--- /dev/null
+++ b/python_hadoop/Pipfile.lock
@@ -0,0 +1,990 @@
+{
+ "_meta": {
+ "hash": {
+ "sha256": "d86e088fe8fe61715668eb35fa7a1d0a78670a782754b556aee0c7f741916aad"
+ },
+ "pipfile-spec": 6,
+ "requires": {
+ "python_version": "3.5"
+ },
+ "sources": [
+ {
+ "name": "ia",
+ "url": "https://devpi.archive.org/wb/prod",
+ "verify_ssl": true
+ },
+ {
+ "name": "pypi",
+ "url": "https://pypi.python.org/simple",
+ "verify_ssl": true
+ }
+ ]
+ },
+ "default": {
+ "boto3": {
+ "hashes": [
+ "sha256:817b6f5e5277a9e370702314adbfcaa6957e138540e50d6b557a717846c6c999",
+ "sha256:8880415ca6d2531dd76c392a00824d952a3074886352bb342c8f8f1cb9403c1a"
+ ],
+ "index": "ia",
+ "version": "==1.9.99"
+ },
+ "botocore": {
+ "hashes": [
+ "sha256:9092d61cbf8052471dcaaac29f8cd1b9dbd5687947719f40dbc30a72c87523f2",
+ "sha256:ac50b9f793164a00ca725dfe60fe2d12a967272b251e6533236139dcade1ee5c"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==1.12.99"
+ },
+ "cachetools": {
+ "hashes": [
+ "sha256:219b7dc6024195b6f2bc3d3f884d1fef458745cd323b04165378622dcc823852",
+ "sha256:9efcc9fab3b49ab833475702b55edd5ae07af1af7a4c627678980b45e459c460"
+ ],
+ "version": "==3.1.0"
+ },
+ "certifi": {
+ "hashes": [
+ "sha256:47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7",
+ "sha256:993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"
+ ],
+ "version": "==2018.11.29"
+ },
+ "chardet": {
+ "hashes": [
+ "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+ "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+ ],
+ "version": "==3.0.4"
+ },
+ "click": {
+ "hashes": [
+ "sha256:2335065e6395b9e67ca716de5f7526736bfa6ceead690adf616d925bdc622b13",
+ "sha256:5b94b49521f6456670fdb30cd82a4eca9412788a93fa6dd6df72c94d5a8ff2d7"
+ ],
+ "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==7.0"
+ },
+ "crawllib": {
+ "hashes": [
+ "sha256:01c47e22757482a7ffa15396a12dcfe27ac90347b1759ae6804d02a7ef6888cf"
+ ],
+ "version": "==0.1.4.1"
+ },
+ "dawg": {
+ "hashes": [
+ "sha256:111aec946fc6045776e8a977f8be841b099769f3c8ab041dba4773ffeda21ad5",
+ "sha256:30d5da3e48b8cbe5ec94c5a202d2962780d3895ba0883123e6788565f71b2953",
+ "sha256:3a5ea13d5a424542d1a7fa908db974e712be90ccdd86cec9e24c6b20794f5f5e",
+ "sha256:402659e3044a5fb79dadefeaabb15ba9c0ef56c844bb4bcde6b102afbf4788f8",
+ "sha256:7accbfe484a353e1f02a947f84f817846f30738d1170d4e855f536d5708632a3",
+ "sha256:7d0a904e91adfa3de7071bfe64cd1334ce4040f1795cca8c13598bd075e72e18",
+ "sha256:9c7321d4f2a580506e06c29ed276ae50df9eb153470e8e980e79409e12b18e55",
+ "sha256:ad0fdd2f6ed0a0155f00e7f61f3649898dabf7e344eb87732b34414f34cc31d9",
+ "sha256:b1f9c72bb3eca530f78fcf82f2d60ff41298f10e1c9f018b402af0ecbe246171",
+ "sha256:d6d5f9e4a37bf9b2c4fec504eaf8cfc30d7f994635c35a6f14ced5f41a72e2f9"
+ ],
+ "version": "==0.7.8"
+ },
+ "decorator": {
+ "hashes": [
+ "sha256:33cd704aea07b4c28b3eb2c97d288a06918275dac0ecebdaf1bc8a48d98adb9e",
+ "sha256:cabb249f4710888a2fc0e13e9a16c343d932033718ff62e1e9bc93a9d3a9122b"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==4.3.2"
+ },
+ "docutils": {
+ "hashes": [
+ "sha256:02aec4bd92ab067f6ff27a38a38a41173bf01bed8f89157768c1573f53e474a6",
+ "sha256:51e64ef2ebfb29cae1faa133b3710143496eca21c530f3f71424d77687764274",
+ "sha256:7a4bd47eaf6596e1295ecb11361139febe29b084a87bf005bf899f9a42edc3c6"
+ ],
+ "version": "==0.14"
+ },
+ "dogpile.cache": {
+ "hashes": [
+ "sha256:691b7f199561c4bd6e7e96f164a43cc3781b0c87bea29b7d59d859f873fd4a31"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==0.7.1"
+ },
+ "elasticsearch": {
+ "hashes": [
+ "sha256:658380fd60bdaf746fef12958f0abc49063218ce93ee1ae4ca1fe6291c896433",
+ "sha256:ae91b089f2f2b5b3daa04297949e5f805ab12d187218cb587273f472656fd250"
+ ],
+ "markers": "python_version != '3.2.*' and python_version != '3.0.*' and python_version >= '2.7' and python_version < '4' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==5.5.3"
+ },
+ "flask": {
+ "hashes": [
+ "sha256:2271c0070dbcb5275fad4a82e29f23ab92682dc45f9dfbc22c02ba9b9322ce48",
+ "sha256:a080b744b7e345ccfcbc77954861cb05b3c63786e93f2b3875e0913d44b43f05"
+ ],
+ "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==1.0.2"
+ },
+ "globalwayback": {
+ "hashes": [
+ "sha256:257c93800c82f77c35002978b2fd9db8e60e40744def0b18eea6b34e704260b8"
+ ],
+ "index": "ia",
+ "version": "==0.3.23.1"
+ },
+ "google-api-core": {
+ "hashes": [
+ "sha256:85693e163a1a6faea69a74f8feaf35d54dfa2559fbdbbe389c93ffb3bb4c9a79",
+ "sha256:eea2d223f7bdc6d68dd1c4681e17cded5a00b5a8e686e1597b89f27f58cf2980"
+ ],
+ "version": "==1.7.0"
+ },
+ "google-auth": {
+ "hashes": [
+ "sha256:0f7c6a64927d34c1a474da92cfc59e552a5d3b940d3266606c6a28b72888b9e4",
+ "sha256:20705f6803fd2c4d1cc2dcb0df09d4dfcb9a7d51fd59e94a3a28231fd93119ed"
+ ],
+ "version": "==1.6.3"
+ },
+ "google-cloud-core": {
+ "hashes": [
+ "sha256:9bee63e0991be9801a4baf0b7841cf54f86c6e7fec922f45ea74cd4032ed4ee4",
+ "sha256:d85b1aaaf3bad9415ad1d8ee5eadce96d7007a82f13ce0a0629a003a11e83f29"
+ ],
+ "version": "==0.29.1"
+ },
+ "google-cloud-dataproc": {
+ "hashes": [
+ "sha256:785e645690f344873cd6f22454db2a39236a2ce5af2b392efbb91ad57944ebac",
+ "sha256:e6a6c380757e22e9a45cf5b261be6d6a4262f87ee172a6c21f6f7ad6013827cd"
+ ],
+ "version": "==0.3.1"
+ },
+ "google-cloud-logging": {
+ "hashes": [
+ "sha256:104e8013afa3a75a8b40240205d7078b04dded332a29b0042b16df58f81c9a8c",
+ "sha256:13ac67399289b202b409e6cef7a87dea32ddabf902f69a677bd05554f6aecf0b"
+ ],
+ "version": "==1.10.0"
+ },
+ "google-cloud-storage": {
+ "hashes": [
+ "sha256:a3115c22a71e2f172fade72c7b7b797a071f3ac9b66043191fc84c214ba0c671",
+ "sha256:aef243b533144c11c9ff750565c43dffe5445debb143697002edb6205f64a437"
+ ],
+ "version": "==1.14.0"
+ },
+ "google-resumable-media": {
+ "hashes": [
+ "sha256:2dae98ee716efe799db3578a7b902fbf5592fc5c77d3c0906fc4ef9b1b930861",
+ "sha256:3e38923493ca0d7de0ad91c31acfefc393c78586db89364e91cb4f11990e51ba"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==0.3.2"
+ },
+ "googleapis-common-protos": {
+ "hashes": [
+ "sha256:d56ca712f67fff216d3be9eeeb8360ca59066d0365ba70b137b9e1801813747e"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==1.5.8"
+ },
+ "grpcio": {
+ "hashes": [
+ "sha256:0134bab8e8d16b195547f9216517b3abcd3e4b6b1f5a1c8940099888003287ac",
+ "sha256:084d4a5f34a671bd0ec4668d3a7a3351015de81e6d4aef6710d9dab026def8cc",
+ "sha256:1ab29724526d8651c8b878257775e17cf3fba7474c01edc76ff8bcfecf570f91",
+ "sha256:1bd017ca22a126af0d7d67b4140b427ae58fd6d79dbd277e6f21be3ee0fdfef7",
+ "sha256:25e7b619973e20d8f2cf05d6af0f2e11263a8792b99c058a5b590ef7aef554b8",
+ "sha256:2e836e6092e6639cc9edb486f27c6fe078408aac54ed345c5762edcf8588d9c2",
+ "sha256:34870eb5d157fe9639f263f0bfe0bcdc1737a6c08181ce113585f6461f37c84b",
+ "sha256:424c8f0748935932d28531ce6d817a11914dfb385b86fe815297f122cd04d592",
+ "sha256:43c42570f769748982c61a249e01eec5f91149e2aa98438c893de64e649d562b",
+ "sha256:4f845d13ecff25012fc9c7f22067fca1d2b3da3f693da146ddcc587fdab3e7b4",
+ "sha256:614de7d6672eb023c08dde70b103efa9faacf86ac63b2a24f8d74b064a86f6f0",
+ "sha256:6c5956292692f385bb12b5f47afd70ae9469d2ee07a949c94aef2946020c1300",
+ "sha256:7030674682433a5cbc069cd5a5fbcdf193c8a3680dc161cd7b984f72ab609f23",
+ "sha256:77fff21bee2d3c3487891cdb69b35190deddac609e48c05262e1097f0b2cd82a",
+ "sha256:8ac64f3e17e6a13abf9628f0ba22012c948d7ab400592510fed3c62444bdcc0d",
+ "sha256:8fdfa8129e1ab2cdf053956dd07b21ccc127c8a8f0c5b83ff60987c009ddb636",
+ "sha256:8ff4935abf61206479dd42c56aba0f6c395aebb5c42b29b1f7c2faae41ad979c",
+ "sha256:9af47d0f4137a2951b73ee592bdc5690b242cfe81cdfacba1b34becbf72a0d59",
+ "sha256:9da5b3c883621afca008d2c5729ddd7f06153f5dcaae1f690bead9b9018a3594",
+ "sha256:abe825aa49e6239d5edf4e222c44170d2c7f6f4b1fd5286b4756a62d8067e112",
+ "sha256:c8330efa27af2b65aa556a66517ba6657a13e259670ad32dec1b6ff3d6616c3c",
+ "sha256:dc3d09abe7b49e84516b53920320d0f0d05587f6398431e50d6a47bd7d27a8b6",
+ "sha256:deb08edefef880609f8bd2945764f31d577785ff3f2daea7027b67432ff12f74",
+ "sha256:e019c86f55cdcd2bbc239beab14167f2e03ee92407c7c42ddf42edf6f5640cce",
+ "sha256:eb0d154c4749458353fbb5a55b39de7aa8445617c20d200729f924be125c56d0",
+ "sha256:eed5edb8f2620ad1157c8c5786809fb0a2d885969287a758752ce514274e3be0",
+ "sha256:f7a9fc2dfbbc0e838c79f908262638fb86ab326b0fbc0ea2c3dd063b3561e9e2",
+ "sha256:f9df2e626f1a8d8114a9dc05a489bdf26a8e926fbbe43112669700f25fe0abb3"
+ ],
+ "version": "==1.18.0"
+ },
+ "happybase": {
+ "hashes": [
+ "sha256:e20376e2e32291798d2226502994134c1c4e175136d8375b3c517a234fa22481"
+ ],
+ "index": "ia",
+ "version": "==1.1.0"
+ },
+ "ialib": {
+ "hashes": [
+ "sha256:30291b8645057cc210d7ec129f17dc25afc63ee09db7cda1657c47408b2ba8dc"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==0.3.0.1"
+ },
+ "idna": {
+ "hashes": [
+ "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
+ "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
+ ],
+ "version": "==2.6"
+ },
+ "itsdangerous": {
+ "hashes": [
+ "sha256:321b033d07f2a4136d3ec762eac9f16a10ccd60f53c0c91af90217ace7ba1f19",
+ "sha256:b12271b2047cb23eeb98c8b5622e2e5c5e9abd9784a153e9d8ef9cb4dd09d749"
+ ],
+ "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==1.1.0"
+ },
+ "jinja2": {
+ "hashes": [
+ "sha256:74c935a1b8bb9a3947c50a54766a969d4846290e1e788ea44c1392163723c3bd",
+ "sha256:f84be1bb0040caca4cea721fcbbbbd61f9be9464ca236387158b0feea01914a4"
+ ],
+ "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==2.10"
+ },
+ "jmespath": {
+ "hashes": [
+ "sha256:6a81d4c9aa62caf061cb517b4d9ad1dd300374cd4706997aff9cd6aedd61fc64",
+ "sha256:f11b4461f425740a1d908e9a3f7365c3d2e569f6ca68a2ff8bc5bcd9676edd63"
+ ],
+ "version": "==0.9.3"
+ },
+ "kazoo": {
+ "hashes": [
+ "sha256:8db774f7bdece7d0dc7decb21539ff0852e42c2ffe1c28d7f1ff6f9292a1c3a4",
+ "sha256:a5fa2e400c5068cfee9e86b35cf0dab8232b574152d8e3590d823b3e2426ab5e"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==2.5.0"
+ },
+ "markupsafe": {
+ "hashes": [
+ "sha256:048ef924c1623740e70204aa7143ec592504045ae4429b59c30054cb31e3c432",
+ "sha256:130f844e7f5bdd8e9f3f42e7102ef1d49b2e6fdf0d7526df3f87281a532d8c8b",
+ "sha256:19f637c2ac5ae9da8bfd98cef74d64b7e1bb8a63038a3505cd182c3fac5eb4d9",
+ "sha256:1b8a7a87ad1b92bd887568ce54b23565f3fd7018c4180136e1cf412b405a47af",
+ "sha256:1c25694ca680b6919de53a4bb3bdd0602beafc63ff001fea2f2fc16ec3a11834",
+ "sha256:1f19ef5d3908110e1e891deefb5586aae1b49a7440db952454b4e281b41620cd",
+ "sha256:1fa6058938190ebe8290e5cae6c351e14e7bb44505c4a7624555ce57fbbeba0d",
+ "sha256:31cbb1359e8c25f9f48e156e59e2eaad51cd5242c05ed18a8de6dbe85184e4b7",
+ "sha256:3e835d8841ae7863f64e40e19477f7eb398674da6a47f09871673742531e6f4b",
+ "sha256:4e97332c9ce444b0c2c38dd22ddc61c743eb208d916e4265a2a3b575bdccb1d3",
+ "sha256:525396ee324ee2da82919f2ee9c9e73b012f23e7640131dd1b53a90206a0f09c",
+ "sha256:52b07fbc32032c21ad4ab060fec137b76eb804c4b9a1c7c7dc562549306afad2",
+ "sha256:52ccb45e77a1085ec5461cde794e1aa037df79f473cbc69b974e73940655c8d7",
+ "sha256:5c3fbebd7de20ce93103cb3183b47671f2885307df4a17a0ad56a1dd51273d36",
+ "sha256:5e5851969aea17660e55f6a3be00037a25b96a9b44d2083651812c99d53b14d1",
+ "sha256:5edfa27b2d3eefa2210fb2f5d539fbed81722b49f083b2c6566455eb7422fd7e",
+ "sha256:7d263e5770efddf465a9e31b78362d84d015cc894ca2c131901a4445eaa61ee1",
+ "sha256:83381342bfc22b3c8c06f2dd93a505413888694302de25add756254beee8449c",
+ "sha256:857eebb2c1dc60e4219ec8e98dfa19553dae33608237e107db9c6078b1167856",
+ "sha256:98e439297f78fca3a6169fd330fbe88d78b3bb72f967ad9961bcac0d7fdd1550",
+ "sha256:bf54103892a83c64db58125b3f2a43df6d2cb2d28889f14c78519394feb41492",
+ "sha256:d9ac82be533394d341b41d78aca7ed0e0f4ba5a2231602e2f05aa87f25c51672",
+ "sha256:e982fe07ede9fada6ff6705af70514a52beb1b2c3d25d4e873e82114cf3c5401",
+ "sha256:edce2ea7f3dfc981c4ddc97add8a61381d9642dc3273737e756517cc03e84dd6",
+ "sha256:efdc45ef1afc238db84cb4963aa689c0408912a0239b0721cb172b4016eb31d6",
+ "sha256:f137c02498f8b935892d5c0172560d7ab54bc45039de8805075e19079c639a9c",
+ "sha256:f82e347a72f955b7017a39708a3667f106e6ad4d10b25f237396a7115d8ed5fd",
+ "sha256:fb7c206e01ad85ce57feeaaa0bf784b97fa3cad0d4a5737bc5295785f5c613a1"
+ ],
+ "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==1.1.0"
+ },
+ "mrjob": {
+ "hashes": [
+ "sha256:1979504fd9a65ee0889ac7e4151fc30f1d32a6bdb2d3c74462b9f104aaf9be68",
+ "sha256:3ba27276e213d317efdd7183044d95f93c3a2175b16016f6b6506aeae1cd54d4"
+ ],
+ "index": "ia",
+ "version": "==0.6.7"
+ },
+ "pillow": {
+ "hashes": [
+ "sha256:1263e38b91ca0132c77d5ae5a4d396bce7e7b1d13427b5d2982ac8f5bfbef62b",
+ "sha256:2602c7152e26f5bece294edb97af40345409ae55f8ad2a6d5da4380f4178defe",
+ "sha256:3183b19cdd6fb5c68498334601eba770bc7abd44977b4119e4fa49d45e12845b",
+ "sha256:3c6133d928643167af35a1dd012889e6ff2e407895d7e16c2425cdab1ab1d608",
+ "sha256:412f4999794a80c9153cd2156f040b8e570b145d2edf5830854578ffb0b27cac",
+ "sha256:4678857a6dd0834a77ad6b5eb75a6d79753aa1a13f54f1c47fdb1e9bca63f389",
+ "sha256:486f4ccddee09429cb1c63ea56c02894aecf9d69acdcaf006c53835df2549fff",
+ "sha256:520dfe2ed09ea90a82d6876e87e82c82ba390d2b2936a95d8e9997eca281546f",
+ "sha256:5cda8efe9e0849858986c06cb068ac4de0933780f84fa989d6dae2a85c304d2b",
+ "sha256:6a06f165dcec5789fd98a5d4fe542619ffd3b86b9bf616d1a54d824e9428c6d3",
+ "sha256:77eac8ee2b400be84618ab5876b0e59fe98e32fc4d99aaa34bf413e125361a05",
+ "sha256:95bd8811ad4ece9df7b8cb9a1eef6184b80b6b8b8c199751ab0a5fb48ae82f64",
+ "sha256:9992d8f4b4ad53467ea76e6b796c18e22ec948dcee064be07fb43c155472e1d7",
+ "sha256:9c116c9784689685ee0c2a6bf74d9bb7a8c8134a93e96d12039eead2065f6842",
+ "sha256:a0b7eeee0346ca67cdd9b23a613de3fe71a4c46419c37bdfef69b82dd32a9a0a",
+ "sha256:a47f8b12541ffc219a0f26030daee2a57d1251cfd76a9101cbea74674909d5a3",
+ "sha256:c34d10dda36d64cecf78bc4689758eca1e79b1e88f6e1d8c7cf207e6b9e7c984",
+ "sha256:d7cf28e14b55e2f8848fb5e37655ffe13a0d5846cccc6ba46e031d0cf21879a3",
+ "sha256:ddef2a522ba13348ecec354d6c4d2e24bd68fba2605d7c32682bc0140d9c4e9c",
+ "sha256:e496387e51fec8d8b98312be0d4332dcffecbd60b42ddfa834baaea62cbddfcb",
+ "sha256:e784b1a9fc54ae88a7171aef60a38c2ec0dc463f066691765d11748e014ce2a0",
+ "sha256:f040b4709cba8922f60de441684b3d061fedb61c6ca50d231df8a4d55e45943c",
+ "sha256:f336019509df1a042b7d6bed69a0cb6c52108b6327ce936c2870145dc18f1394"
+ ],
+ "version": "==3.1.1"
+ },
+ "ply": {
+ "hashes": [
+ "sha256:00c7c1aaa88358b9c765b6d3000c6eec0ba42abca5351b095321aef446081da3",
+ "sha256:096f9b8350b65ebd2fd1346b12452efe5b9607f7482813ffca50c22722a807ce"
+ ],
+ "version": "==3.11"
+ },
+ "protobuf": {
+ "hashes": [
+ "sha256:10394a4d03af7060fa8a6e1cbf38cea44be1467053b0aea5bbfcb4b13c4b88c4",
+ "sha256:1489b376b0f364bcc6f89519718c057eb191d7ad6f1b395ffd93d1aa45587811",
+ "sha256:1931d8efce896981fe410c802fd66df14f9f429c32a72dd9cfeeac9815ec6444",
+ "sha256:196d3a80f93c537f27d2a19a4fafb826fb4c331b0b99110f985119391d170f96",
+ "sha256:46e34fdcc2b1f2620172d3a4885128705a4e658b9b62355ae5e98f9ea19f42c2",
+ "sha256:4b92e235a3afd42e7493b281c8b80c0c65cbef45de30f43d571d1ee40a1f77ef",
+ "sha256:574085a33ca0d2c67433e5f3e9a0965c487410d6cb3406c83bdaf549bfc2992e",
+ "sha256:59cd75ded98094d3cf2d79e84cdb38a46e33e7441b2826f3838dcc7c07f82995",
+ "sha256:5ee0522eed6680bb5bac5b6d738f7b0923b3cafce8c4b1a039a6107f0841d7ed",
+ "sha256:65917cfd5da9dfc993d5684643063318a2e875f798047911a9dd71ca066641c9",
+ "sha256:685bc4ec61a50f7360c9fd18e277b65db90105adbf9c79938bd315435e526b90",
+ "sha256:92e8418976e52201364a3174e40dc31f5fd8c147186d72380cbda54e0464ee19",
+ "sha256:9335f79d1940dfb9bcaf8ec881fb8ab47d7a2c721fb8b02949aab8bbf8b68625",
+ "sha256:a7ee3bb6de78185e5411487bef8bc1c59ebd97e47713cba3c460ef44e99b3db9",
+ "sha256:ceec283da2323e2431c49de58f80e1718986b79be59c266bb0509cbf90ca5b9e",
+ "sha256:fcfc907746ec22716f05ea96b7f41597dfe1a1c088f861efb8a0d4f4196a6f10"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==3.6.1"
+ },
+ "publicsuffix": {
+ "hashes": [
+ "sha256:99a3a06d6eb19c57057d17560908b757995396ad76e6513c9d17e6a7a1266c91",
+ "sha256:ae77593d269e1e5131723259cc1142c25690c20c59f2e98f67e227228028bda9",
+ "sha256:eeb90d6cb0ae26d3af43f4d53f4c5eb6cfa437ad16a73c06c6caabb8f36ae1e5"
+ ],
+ "version": "==1.1.0"
+ },
+ "pyasn1": {
+ "hashes": [
+ "sha256:da2420fe13a9452d8ae97a0e478adde1dee153b11ba832a95b223a2ba01c10f7",
+ "sha256:da6b43a8c9ae93bc80e2739efb38cc776ba74a886e3e9318d65fe81a8b8a2c6e"
+ ],
+ "version": "==0.4.5"
+ },
+ "pyasn1-modules": {
+ "hashes": [
+ "sha256:79580acf813e3b7d6e69783884e6e83ac94bf4617b36a135b85c599d8a818a7b",
+ "sha256:a52090e8c5841ebbf08ae455146792d9ef3e8445b21055d3a3b7ed9c712b7c7c"
+ ],
+ "version": "==0.2.4"
+ },
+ "pykafka": {
+ "hashes": [
+ "sha256:f0bbd394ae6970042a587c99fe4dc0966e67787249d963d4ce2f810dc9490577"
+ ],
+ "index": "ia",
+ "version": "==2.8.0"
+ },
+ "pylru": {
+ "hashes": [
+ "sha256:e03a3d354eb8fdfa11638698e8a1f06cd3b3a214ebc0a120c603a79290d9ebec"
+ ],
+ "version": "==1.1.0"
+ },
+ "pymysql": {
+ "hashes": [
+ "sha256:3943fbbbc1e902f41daf7f9165519f140c4451c179380677e6a848587042561a",
+ "sha256:d8c059dcd81dedb85a9f034d5e22dcb4442c0b201908bede99e306d65ea7c8e7"
+ ],
+ "version": "==0.9.3"
+ },
+ "python-dateutil": {
+ "hashes": [
+ "sha256:7e6584c74aeed623791615e26efd690f29817a27c73085b78e4bad02493df2fb",
+ "sha256:c89805f6f4d64db21ed966fda138f8a5ed7a4fdbc1a8ee329ce1b74e3c74da9e"
+ ],
+ "markers": "python_version >= '2.7'",
+ "version": "==2.8.0"
+ },
+ "python-snappy": {
+ "hashes": [
+ "sha256:59c79d83350f931ad5cf8f06ccb1c9bd1087a77c3ca7e00806884cda654a6faf",
+ "sha256:5fb0e2e5487e8ee462838ff928a186ba682bf519921d9b204db7d2b4fb6ced16",
+ "sha256:64ced2234becfe661962bc4c152e38cea03a2343ad6206a45d04c9ce61ad640f",
+ "sha256:748c2c9fec50d8a88861f369083067ec35b4a5d234f07b94bca70c6f89408f14",
+ "sha256:8a7f803f06083d4106d55387d2daa32c12b5e376c3616b0e2da8b8a87a27d74a"
+ ],
+ "index": "ia",
+ "version": "==0.5.3"
+ },
+ "pytz": {
+ "hashes": [
+ "sha256:32b0891edff07e28efe91284ed9c31e123d84bea3fd98e1f72be2508f43ef8d9",
+ "sha256:d5f05e487007e29e03409f9398d074e158d920d36eb82eaf66fb1136b0c5374c"
+ ],
+ "version": "==2018.9"
+ },
+ "pyyaml": {
+ "hashes": [
+ "sha256:3d7da3009c0f3e783b2c873687652d83b1bbfd5c88e9813fb7e5b03c0dd3108b",
+ "sha256:3ef3092145e9b70e3ddd2c7ad59bdd0252a94dfe3949721633e41344de00a6bf",
+ "sha256:40c71b8e076d0550b2e6380bada1f1cd1017b882f7e16f09a65be98e017f211a",
+ "sha256:558dd60b890ba8fd982e05941927a3911dc409a63dcb8b634feaa0cda69330d3",
+ "sha256:a7c28b45d9f99102fa092bb213aa12e0aaf9a6a1f5e395d36166639c1f96c3a1",
+ "sha256:aa7dd4a6a427aed7df6fb7f08a580d68d9b118d90310374716ae90b710280af1",
+ "sha256:bc558586e6045763782014934bfaf39d48b8ae85a2713117d16c39864085c613",
+ "sha256:d46d7982b62e0729ad0175a9bc7e10a566fc07b224d2c79fafb5e032727eaa04",
+ "sha256:d5eef459e30b09f5a098b9cea68bebfeb268697f78d647bd255a085371ac7f3f",
+ "sha256:e01d3203230e1786cd91ccfdc8f8454c8069c91bee3962ad93b87a4b2860f537",
+ "sha256:e170a9e6fcfd19021dd29845af83bb79236068bf5fd4df3327c1be18182b2531"
+ ],
+ "version": "==3.13"
+ },
+ "raven": {
+ "hashes": [
+ "sha256:3fa6de6efa2493a7c827472e984ce9b020797d0da16f1db67197bcc23c8fae54",
+ "sha256:44a13f87670836e153951af9a3c80405d36b43097db869a36e92809673692ce4"
+ ],
+ "index": "ia",
+ "version": "==6.10.0"
+ },
+ "redis": {
+ "hashes": [
+ "sha256:724932360d48e5407e8f82e405ab3650a36ed02c7e460d1e6fddf0f038422b54",
+ "sha256:9b19425a38fd074eb5795ff2b0d9a55b46a44f91f5347995f27e3ad257a7d775"
+ ],
+ "markers": "python_version >= '2.7' and python_version != '3.2.*' and python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*'",
+ "version": "==3.2.0"
+ },
+ "requests": {
+ "hashes": [
+ "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",
+ "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"
+ ],
+ "index": "ia",
+ "version": "==2.21.0"
+ },
+ "requests-file": {
+ "hashes": [
+ "sha256:75c175eed739270aec3c5279ffd74e6527dada275c5c0d76b5817e9c86bb7dea",
+ "sha256:8f04aa6201bacda0567e7ac7f677f1499b0fc76b22140c54bc06edf1ba92e2fa"
+ ],
+ "version": "==1.4.3"
+ },
+ "robotexclusionrulesparser": {
+ "hashes": [
+ "sha256:d23aa14ae8145c13c95612d696736bad52a4bd0819ce8c9437ee745098fb8388"
+ ],
+ "version": "==1.7.1"
+ },
+ "rsa": {
+ "hashes": [
+ "sha256:14ba45700ff1ec9eeb206a2ce76b32814958a98e372006c8fb76ba820211be66",
+ "sha256:1a836406405730121ae9823e19c6e806c62bbad73f890574fff50efa4122c487"
+ ],
+ "version": "==4.0"
+ },
+ "s3transfer": {
+ "hashes": [
+ "sha256:7b9ad3213bff7d357f888e0fab5101b56fa1a0548ee77d121c3a3dbfbef4cb2e",
+ "sha256:f23d5cb7d862b104401d9021fc82e5fa0e0cf57b7660a1331425aab0c691d021"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==0.2.0"
+ },
+ "schedule": {
+ "hashes": [
+ "sha256:3f895a1036799a25ab9c335de917073e63cf8256920917e932777382f101f08f",
+ "sha256:f9fb5181283de4db6e701d476dd01b6a3dd81c38462a54991ddbb9d26db857c9"
+ ],
+ "version": "==0.6.0"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==1.12.0"
+ },
+ "sqlalchemy": {
+ "hashes": [
+ "sha256:8027fa183f5be466030617a497b2d64e0e16c8d615e5a34bdf9fab6f66bf4723"
+ ],
+ "version": "==1.2.18"
+ },
+ "surt": {
+ "hashes": [
+ "sha256:5691e63b189af04aa1fb178ecce5fc7d872cc582e2b6861d4500f6d41915306a"
+ ],
+ "version": "==0.3.1"
+ },
+ "tabulate": {
+ "hashes": [
+ "sha256:8af07a39377cee1103a5c8b3330a421c2d99b9141e9cc5ddd2e3263fea416943"
+ ],
+ "version": "==0.8.3"
+ },
+ "thriftpy": {
+ "hashes": [
+ "sha256:309e57d97b5bfa01601393ad4f245451e989d6206a59279e56866b264a99796d",
+ "sha256:498960d6a4ebeaea1da4d85cea5d86b59c5a7aa93d5bc4c605ac33a11699e9db",
+ "sha256:6060f6354ba5aa3c0b071d87c216394d10b9116015bdba26634bafcaff86e0ca",
+ "sha256:67d8501b88e4ead17e3008db2261bcda5845e63d1e83b8168c5d96056990af3a",
+ "sha256:6baceabd40f0934186ebcfd1f559d34a9f165b65ac5d396a39ef7f61e44d9156"
+ ],
+ "version": "==0.3.9"
+ },
+ "tldextract": {
+ "hashes": [
+ "sha256:29797125db1f2e72ce2ee51f7a764ec8b1e6588812520795ffeae93bcd46bab4",
+ "sha256:84a0b275c262e34df7506e10767e357e8b5a755a3a620cdc2cfe035061f7806d"
+ ],
+ "version": "==2.2.0"
+ },
+ "twitter": {
+ "hashes": [
+ "sha256:52545fd3b70d3d3807d3ce62d1a256727856d784d1630d64dedcc643aaf0b908",
+ "sha256:acdc85e5beea752967bb64c63bde8b915c49a31a01db1b2fecccf9f2c1d5c44d"
+ ],
+ "version": "==1.18.0"
+ },
+ "urllib3": {
+ "hashes": [
+ "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
+ "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+ ],
+ "markers": "python_version >= '3.4'",
+ "version": "==1.22"
+ },
+ "warctools": {
+ "hashes": [
+ "sha256:ce0c6e274db8ac8810f7c97b3943e8e8deadbc3f5c982db77cddaae2d2ae6170"
+ ],
+ "version": "==4.10.0"
+ },
+ "wayback": {
+ "hashes": [
+ "sha256:e095116ce5b71e2efb06afe6bdfbc7923906aeb87dc00f1225c2b7f7013070f6"
+ ],
+ "index": "ia",
+ "version": "==0.4.1.1"
+ },
+ "wayback-esp": {
+ "hashes": [
+ "sha256:4cd5d38da78115c07f6d95f109f7f5324b874c19ae1e59c2b026a4d707879b58"
+ ],
+ "version": "==0.2.2.2"
+ },
+ "wayback-search-js": {
+ "hashes": [
+ "sha256:0f358635e12c60d41625e1d1e0ec8fc76602f2c32c08337693a2406289abbe08"
+ ],
+ "version": "==1.4.17"
+ },
+ "wbex-client": {
+ "hashes": [
+ "sha256:447611c3df85175854c063ed784bb928f03262ad9a50fab2d74531c59200d94c"
+ ],
+ "markers": "python_version >= '2.6' and python_version != '3.0.*' and python_version != '3.1.*'",
+ "version": "==0.1.5"
+ },
+ "werkzeug": {
+ "hashes": [
+ "sha256:c3fd7a7d41976d9f44db327260e263132466836cef6f91512889ed60ad26557c",
+ "sha256:d5da73735293558eb1651ee2fddc4d0dedcfa06538b8813a2e20011583c9e49b"
+ ],
+ "version": "==0.14.1"
+ },
+ "xmltodict": {
+ "hashes": [
+ "sha256:50d8c638ed7ecb88d90561beedbf720c9b4e851a9fa6c47ebd64e99d166d8a21",
+ "sha256:8bbcb45cc982f48b2ca8fe7e7827c5d792f217ecf1792626f808bf41c3b86051"
+ ],
+ "index": "ia",
+ "version": "==0.12.0"
+ }
+ },
+ "develop": {
+ "astroid": {
+ "hashes": [
+ "sha256:35b032003d6a863f5dcd7ec11abd5cd5893428beaa31ab164982403bcb311f22",
+ "sha256:6a5d668d7dc69110de01cdf7aeec69a679ef486862a0850cc0fd5571505b6b7e"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==2.1.0"
+ },
+ "atomicwrites": {
+ "hashes": [
+ "sha256:03472c30eb2c5d1ba9227e4c2ca66ab8287fbfbbda3888aa93dc2e28fc6811b4",
+ "sha256:75a9445bac02d8d058d5e1fe689654ba5a6556a1dfd8ce6ec55a0ed79866cfa6"
+ ],
+ "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'",
+ "version": "==1.3.0"
+ },
+ "attrs": {
+ "hashes": [
+ "sha256:10cbf6e27dbce8c30807caf056c8eb50917e0eaafe86347671b57254006c3e69",
+ "sha256:ca4be454458f9dec299268d472aaa5a11f67a4ff70093396e1ceae9c76cf4bbb"
+ ],
+ "version": "==18.2.0"
+ },
+ "backcall": {
+ "hashes": [
+ "sha256:38ecd85be2c1e78f77fd91700c76e14667dc21e2713b63876c0eb901196e01e4",
+ "sha256:bbbf4b1e5cd2bdb08f915895b51081c041bac22394fdfcfdfbe9f14b77c08bf2"
+ ],
+ "version": "==0.1.0"
+ },
+ "certifi": {
+ "hashes": [
+ "sha256:47f9c83ef4c0c621eaef743f133f09fa8a74a9b75f037e8624f83bd1b6626cb7",
+ "sha256:993f830721089fef441cdfeb4b2c8c9df86f0c63239f06bd025a76a7daddb033"
+ ],
+ "version": "==2018.11.29"
+ },
+ "chardet": {
+ "hashes": [
+ "sha256:84ab92ed1c4d4f16916e05906b6b75a6c0fb5db821cc65e70cbd64a3e2a5eaae",
+ "sha256:fc323ffcaeaed0e0a02bf4d117757b98aed530d9ed4531e3e15460124c106691"
+ ],
+ "version": "==3.0.4"
+ },
+ "coverage": {
+ "hashes": [
+ "sha256:09e47c529ff77bf042ecfe858fb55c3e3eb97aac2c87f0349ab5a7efd6b3939f",
+ "sha256:0a1f9b0eb3aa15c990c328535655847b3420231af299386cfe5efc98f9c250fe",
+ "sha256:0cc941b37b8c2ececfed341444a456912e740ecf515d560de58b9a76562d966d",
+ "sha256:10e8af18d1315de936d67775d3a814cc81d0747a1a0312d84e27ae5610e313b0",
+ "sha256:1b4276550b86caa60606bd3572b52769860a81a70754a54acc8ba789ce74d607",
+ "sha256:1e8a2627c48266c7b813975335cfdea58c706fe36f607c97d9392e61502dc79d",
+ "sha256:2b224052bfd801beb7478b03e8a66f3f25ea56ea488922e98903914ac9ac930b",
+ "sha256:447c450a093766744ab53bf1e7063ec82866f27bcb4f4c907da25ad293bba7e3",
+ "sha256:46101fc20c6f6568561cdd15a54018bb42980954b79aa46da8ae6f008066a30e",
+ "sha256:4710dc676bb4b779c4361b54eb308bc84d64a2fa3d78e5f7228921eccce5d815",
+ "sha256:510986f9a280cd05189b42eee2b69fecdf5bf9651d4cd315ea21d24a964a3c36",
+ "sha256:5535dda5739257effef56e49a1c51c71f1d37a6e5607bb25a5eee507c59580d1",
+ "sha256:5a7524042014642b39b1fcae85fb37556c200e64ec90824ae9ecf7b667ccfc14",
+ "sha256:5f55028169ef85e1fa8e4b8b1b91c0b3b0fa3297c4fb22990d46ff01d22c2d6c",
+ "sha256:6694d5573e7790a0e8d3d177d7a416ca5f5c150742ee703f3c18df76260de794",
+ "sha256:6831e1ac20ac52634da606b658b0b2712d26984999c9d93f0c6e59fe62ca741b",
+ "sha256:77f0d9fa5e10d03aa4528436e33423bfa3718b86c646615f04616294c935f840",
+ "sha256:828ad813c7cdc2e71dcf141912c685bfe4b548c0e6d9540db6418b807c345ddd",
+ "sha256:85a06c61598b14b015d4df233d249cd5abfa61084ef5b9f64a48e997fd829a82",
+ "sha256:8cb4febad0f0b26c6f62e1628f2053954ad2c555d67660f28dfb1b0496711952",
+ "sha256:a5c58664b23b248b16b96253880b2868fb34358911400a7ba39d7f6399935389",
+ "sha256:aaa0f296e503cda4bc07566f592cd7a28779d433f3a23c48082af425d6d5a78f",
+ "sha256:ab235d9fe64833f12d1334d29b558aacedfbca2356dfb9691f2d0d38a8a7bfb4",
+ "sha256:b3b0c8f660fae65eac74fbf003f3103769b90012ae7a460863010539bb7a80da",
+ "sha256:bab8e6d510d2ea0f1d14f12642e3f35cefa47a9b2e4c7cea1852b52bc9c49647",
+ "sha256:c45297bbdbc8bb79b02cf41417d63352b70bcb76f1bbb1ee7d47b3e89e42f95d",
+ "sha256:d19bca47c8a01b92640c614a9147b081a1974f69168ecd494687c827109e8f42",
+ "sha256:d64b4340a0c488a9e79b66ec9f9d77d02b99b772c8b8afd46c1294c1d39ca478",
+ "sha256:da969da069a82bbb5300b59161d8d7c8d423bc4ccd3b410a9b4d8932aeefc14b",
+ "sha256:ed02c7539705696ecb7dc9d476d861f3904a8d2b7e894bd418994920935d36bb",
+ "sha256:ee5b8abc35b549012e03a7b1e86c09491457dba6c94112a2482b18589cc2bdb9"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*' and python_version < '4' and python_version != '3.2.*'",
+ "version": "==4.5.2"
+ },
+ "decorator": {
+ "hashes": [
+ "sha256:33cd704aea07b4c28b3eb2c97d288a06918275dac0ecebdaf1bc8a48d98adb9e",
+ "sha256:cabb249f4710888a2fc0e13e9a16c343d932033718ff62e1e9bc93a9d3a9122b"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==4.3.2"
+ },
+ "happybase-mock": {
+ "hashes": [
+ "sha256:8c91787865c869ac6f5269768a75f5ea0c846162cdd82c5cf3de7aa09ed67c3b",
+ "sha256:ebc0026169f2f4456121269524599087fb3f416d2362d824657c4ce8ec2c355e"
+ ],
+ "index": "ia",
+ "version": "==0.10.0"
+ },
+ "idna": {
+ "hashes": [
+ "sha256:2c6a5de3089009e3da7c5dde64a141dbc8551d5b7f6cf4ed7c2568d0cc520a8f",
+ "sha256:8c7309c718f94b3a625cb648ace320157ad16ff131ae0af362c9f21b80ef6ec4"
+ ],
+ "version": "==2.6"
+ },
+ "ipython": {
+ "hashes": [
+ "sha256:06de667a9e406924f97781bda22d5d76bfb39762b678762d86a466e63f65dc39",
+ "sha256:5d3e020a6b5f29df037555e5c45ab1088d6a7cf3bd84f47e0ba501eeb0c3ec82"
+ ],
+ "index": "ia",
+ "version": "==7.3.0"
+ },
+ "ipython-genutils": {
+ "hashes": [
+ "sha256:72dd37233799e619666c9f639a9da83c34013a73e8bbc79a7a6348d93c61fab8",
+ "sha256:eb2e116e75ecef9d4d228fdc66af54269afa26ab4463042e33785b887c628ba8"
+ ],
+ "version": "==0.2.0"
+ },
+ "isort": {
+ "hashes": [
+ "sha256:1153601da39a25b14ddc54955dbbacbb6b2d19135386699e2ad58517953b34af",
+ "sha256:b9c40e9750f3d77e6e4d441d8b0266cf555e7cdabdcff33c4fd06366ca761ef8",
+ "sha256:ec9ef8f4a9bc6f71eec99e1806bfa2de401650d996c59330782b89a5555c1497"
+ ],
+ "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'",
+ "version": "==4.3.4"
+ },
+ "jedi": {
+ "hashes": [
+ "sha256:571702b5bd167911fe9036e5039ba67f820d6502832285cde8c881ab2b2149fd",
+ "sha256:c8481b5e59d34a5c7c42e98f6625e633f6ef59353abea6437472c7ec2093f191"
+ ],
+ "version": "==0.13.2"
+ },
+ "lazy-object-proxy": {
+ "hashes": [
+ "sha256:0ce34342b419bd8f018e6666bfef729aec3edf62345a53b537a4dcc115746a33",
+ "sha256:1b668120716eb7ee21d8a38815e5eb3bb8211117d9a90b0f8e21722c0758cc39",
+ "sha256:209615b0fe4624d79e50220ce3310ca1a9445fd8e6d3572a896e7f9146bbf019",
+ "sha256:27bf62cb2b1a2068d443ff7097ee33393f8483b570b475db8ebf7e1cba64f088",
+ "sha256:27ea6fd1c02dcc78172a82fc37fcc0992a94e4cecf53cb6d73f11749825bd98b",
+ "sha256:2c1b21b44ac9beb0fc848d3993924147ba45c4ebc24be19825e57aabbe74a99e",
+ "sha256:2df72ab12046a3496a92476020a1a0abf78b2a7db9ff4dc2036b8dd980203ae6",
+ "sha256:320ffd3de9699d3892048baee45ebfbbf9388a7d65d832d7e580243ade426d2b",
+ "sha256:50e3b9a464d5d08cc5227413db0d1c4707b6172e4d4d915c1c70e4de0bbff1f5",
+ "sha256:5276db7ff62bb7b52f77f1f51ed58850e315154249aceb42e7f4c611f0f847ff",
+ "sha256:61a6cf00dcb1a7f0c773ed4acc509cb636af2d6337a08f362413c76b2b47a8dd",
+ "sha256:6ae6c4cb59f199d8827c5a07546b2ab7e85d262acaccaacd49b62f53f7c456f7",
+ "sha256:7661d401d60d8bf15bb5da39e4dd72f5d764c5aff5a86ef52a042506e3e970ff",
+ "sha256:7bd527f36a605c914efca5d3d014170b2cb184723e423d26b1fb2fd9108e264d",
+ "sha256:7cb54db3535c8686ea12e9535eb087d32421184eacc6939ef15ef50f83a5e7e2",
+ "sha256:7f3a2d740291f7f2c111d86a1c4851b70fb000a6c8883a59660d95ad57b9df35",
+ "sha256:81304b7d8e9c824d058087dcb89144842c8e0dea6d281c031f59f0acf66963d4",
+ "sha256:933947e8b4fbe617a51528b09851685138b49d511af0b6c0da2539115d6d4514",
+ "sha256:94223d7f060301b3a8c09c9b3bc3294b56b2188e7d8179c762a1cda72c979252",
+ "sha256:ab3ca49afcb47058393b0122428358d2fbe0408cf99f1b58b295cfeb4ed39109",
+ "sha256:bd6292f565ca46dee4e737ebcc20742e3b5be2b01556dafe169f6c65d088875f",
+ "sha256:cb924aa3e4a3fb644d0c463cad5bc2572649a6a3f68a7f8e4fbe44aaa6d77e4c",
+ "sha256:d0fc7a286feac9077ec52a927fc9fe8fe2fabab95426722be4c953c9a8bede92",
+ "sha256:ddc34786490a6e4ec0a855d401034cbd1242ef186c20d79d2166d6a4bd449577",
+ "sha256:e34b155e36fa9da7e1b7c738ed7767fc9491a62ec6af70fe9da4a057759edc2d",
+ "sha256:e5b9e8f6bda48460b7b143c3821b21b452cb3a835e6bbd5dd33aa0c8d3f5137d",
+ "sha256:e81ebf6c5ee9684be8f2c87563880f93eedd56dd2b6146d8a725b50b7e5adb0f",
+ "sha256:eb91be369f945f10d3a49f5f9be8b3d0b93a4c2be8f8a5b83b0571b8123e0a7a",
+ "sha256:f460d1ceb0e4a5dcb2a652db0904224f367c9b3c1470d5a7683c0480e582468b"
+ ],
+ "version": "==1.3.1"
+ },
+ "mccabe": {
+ "hashes": [
+ "sha256:ab8a6258860da4b6677da4bd2fe5dc2c659cff31b3ee4f7f5d64e79735b80d42",
+ "sha256:dd8d182285a0fe56bace7f45b5e7d1a6ebcbf524e8f3bd87eb0f125271b8831f"
+ ],
+ "version": "==0.6.1"
+ },
+ "more-itertools": {
+ "hashes": [
+ "sha256:0125e8f60e9e031347105eb1682cef932f5e97d7b9a1a28d9bf00c22a5daef40",
+ "sha256:590044e3942351a1bdb1de960b739ff4ce277960f2425ad4509446dbace8d9d1"
+ ],
+ "markers": "python_version > '2.7'",
+ "version": "==6.0.0"
+ },
+ "parso": {
+ "hashes": [
+ "sha256:4580328ae3f548b358f4901e38c0578229186835f0fa0846e47369796dd5bcc9",
+ "sha256:68406ebd7eafe17f8e40e15a84b56848eccbf27d7c1feb89e93d8fca395706db"
+ ],
+ "version": "==0.3.4"
+ },
+ "pathlib2": {
+ "hashes": [
+ "sha256:25199318e8cc3c25dcb45cbe084cc061051336d5a9ea2a12448d3d8cb748f742",
+ "sha256:5887121d7f7df3603bca2f710e7219f3eca0eb69e0b7cc6e0a022e155ac931a7"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==2.3.3"
+ },
+ "pexpect": {
+ "hashes": [
+ "sha256:2a8e88259839571d1251d278476f3eec5db26deb73a70be5ed5dc5435e418aba",
+ "sha256:3fbd41d4caf27fa4a377bfd16fef87271099463e6fa73e92a52f92dfee5d425b"
+ ],
+ "markers": "sys_platform != 'win32'",
+ "version": "==4.6.0"
+ },
+ "pickleshare": {
+ "hashes": [
+ "sha256:87683d47965c1da65cdacaf31c8441d12b8044cdec9aca500cd78fc2c683afca",
+ "sha256:9649af414d74d4df115d5d718f82acb59c9d418196b7b4290ed47a12ce62df56"
+ ],
+ "version": "==0.7.5"
+ },
+ "pluggy": {
+ "hashes": [
+ "sha256:8ddc32f03971bfdf900a81961a48ccf2fb677cf7715108f85295c67405798616",
+ "sha256:980710797ff6a041e9a73a5787804f848996ecaa6f8a1b1e08224a5894f2074a"
+ ],
+ "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'",
+ "version": "==0.8.1"
+ },
+ "prompt-toolkit": {
+ "hashes": [
+ "sha256:11adf3389a996a6d45cc277580d0d53e8a5afd281d0c9ec71b28e6f121463780",
+ "sha256:2519ad1d8038fd5fc8e770362237ad0364d16a7650fb5724af6997ed5515e3c1",
+ "sha256:977c6583ae813a37dc1c2e1b715892461fcbdaa57f6fc62f33a528c4886c8f55"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==2.0.9"
+ },
+ "ptyprocess": {
+ "hashes": [
+ "sha256:923f299cc5ad920c68f2bc0bc98b75b9f838b93b599941a6b63ddbc2476394c0",
+ "sha256:d7cc528d76e76342423ca640335bd3633420dc1366f258cb31d05e865ef5ca1f"
+ ],
+ "version": "==0.6.0"
+ },
+ "py": {
+ "hashes": [
+ "sha256:bf92637198836372b520efcba9e020c330123be8ce527e535d185ed4b6f45694",
+ "sha256:e76826342cefe3c3d5f7e8ee4316b80d1dd8a300781612ddbc765c17ba25a6c6"
+ ],
+ "markers": "python_version != '3.0.*' and python_version != '3.3.*' and python_version != '3.1.*' and python_version >= '2.7' and python_version != '3.2.*'",
+ "version": "==1.7.0"
+ },
+ "pygments": {
+ "hashes": [
+ "sha256:5ffada19f6203563680669ee7f53b64dabbeb100eb51b61996085e99c03b284a",
+ "sha256:e8218dd399a61674745138520d0d4cf2621d7e032439341bc3f647bff125818d"
+ ],
+ "version": "==2.3.1"
+ },
+ "pylint": {
+ "hashes": [
+ "sha256:689de29ae747642ab230c6d37be2b969bf75663176658851f456619aacf27492",
+ "sha256:771467c434d0d9f081741fec1d64dfb011ed26e65e12a28fe06ca2f61c4d556c"
+ ],
+ "index": "ia",
+ "version": "==2.2.2"
+ },
+ "pytest": {
+ "hashes": [
+ "sha256:067a1d4bf827ffdd56ad21bd46674703fce77c5957f6c1eef731f6146bfcef1c",
+ "sha256:9687049d53695ad45cf5fdc7bbd51f0c49f1ea3ecfc4b7f3fde7501b541f17f4"
+ ],
+ "index": "ia",
+ "version": "==4.3.0"
+ },
+ "pytest-cov": {
+ "hashes": [
+ "sha256:0ab664b25c6aa9716cbf203b17ddb301932383046082c081b9848a0edf5add33",
+ "sha256:230ef817450ab0699c6cc3c9c8f7a829c34674456f2ed8df1fe1d39780f7c87f"
+ ],
+ "index": "ia",
+ "version": "==2.6.1"
+ },
+ "pytest-pythonpath": {
+ "hashes": [
+ "sha256:63fc546ace7d2c845c1ee289e8f7a6362c2b6bae497d10c716e58e253e801d62"
+ ],
+ "index": "ia",
+ "version": "==0.7.3"
+ },
+ "requests": {
+ "hashes": [
+ "sha256:502a824f31acdacb3a35b6690b5fbf0bc41d63a24a45c4004352b0242707598e",
+ "sha256:7bf2a778576d825600030a110f3c0e3e8edc51dfaafe1c146e39a2027784957b"
+ ],
+ "index": "ia",
+ "version": "==2.21.0"
+ },
+ "responses": {
+ "hashes": [
+ "sha256:c85882d2dc608ce6b5713a4e1534120f4a0dc6ec79d1366570d2b0c909a50c87",
+ "sha256:ea5a14f9aea173e3b786ff04cf03133c2dabd4103dbaef1028742fd71a6c2ad3"
+ ],
+ "index": "ia",
+ "version": "==0.10.5"
+ },
+ "six": {
+ "hashes": [
+ "sha256:3350809f0555b11f552448330d0b52d5f24c91a322ea4a15ef22629740f3761c",
+ "sha256:d16a0141ec1a18405cd4ce8b4613101da75da0e9a7aec5bdd4fa804d0e0eba73"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==1.12.0"
+ },
+ "traitlets": {
+ "hashes": [
+ "sha256:9c4bd2d267b7153df9152698efb1050a5d84982d3384a37b2c1f7723ba3e7835",
+ "sha256:c6cb5e6f57c5a9bdaa40fa71ce7b4af30298fbab9ece9815b5d995ab6217c7d9"
+ ],
+ "markers": "python_version != '3.0.*' and python_version >= '2.6' and python_version != '3.1.*'",
+ "version": "==4.3.2"
+ },
+ "typed-ast": {
+ "hashes": [
+ "sha256:035a54ede6ce1380599b2ce57844c6554666522e376bd111eb940fbc7c3dad23",
+ "sha256:037c35f2741ce3a9ac0d55abfcd119133cbd821fffa4461397718287092d9d15",
+ "sha256:049feae7e9f180b64efacbdc36b3af64a00393a47be22fa9cb6794e68d4e73d3",
+ "sha256:19228f7940beafc1ba21a6e8e070e0b0bfd1457902a3a81709762b8b9039b88d",
+ "sha256:2ea681e91e3550a30c2265d2916f40a5f5d89b59469a20f3bad7d07adee0f7a6",
+ "sha256:3a6b0a78af298d82323660df5497bcea0f0a4a25a0b003afd0ce5af049bd1f60",
+ "sha256:5385da8f3b801014504df0852bf83524599df890387a3c2b17b7caa3d78b1773",
+ "sha256:606d8afa07eef77280c2bf84335e24390055b478392e1975f96286d99d0cb424",
+ "sha256:69245b5b23bbf7fb242c9f8f08493e9ecd7711f063259aefffaeb90595d62287",
+ "sha256:6f6d839ab09830d59b7fa8fb6917023d8cb5498ee1f1dbd82d37db78eb76bc99",
+ "sha256:730888475f5ac0e37c1de4bd05eeb799fdb742697867f524dc8a4cd74bcecc23",
+ "sha256:9819b5162ffc121b9e334923c685b0d0826154e41dfe70b2ede2ce29034c71d8",
+ "sha256:9e60ef9426efab601dd9aa120e4ff560f4461cf8442e9c0a2b92548d52800699",
+ "sha256:af5fbdde0690c7da68e841d7fc2632345d570768ea7406a9434446d7b33b0ee1",
+ "sha256:b64efdbdf3bbb1377562c179f167f3bf301251411eb5ac77dec6b7d32bcda463",
+ "sha256:bac5f444c118aeb456fac1b0b5d14c6a71ea2a42069b09c176f75e9bd4c186f6",
+ "sha256:bda9068aafb73859491e13b99b682bd299c1b5fd50644d697533775828a28ee0",
+ "sha256:d659517ca116e6750101a1326107d3479028c5191f0ecee3c7203c50f5b915b0",
+ "sha256:eddd3fb1f3e0f82e5915a899285a39ee34ce18fd25d89582bc89fc9fb16cd2c6"
+ ],
+ "markers": "python_version < '3.7' and implementation_name == 'cpython'",
+ "version": "==1.3.1"
+ },
+ "urllib3": {
+ "hashes": [
+ "sha256:06330f386d6e4b195fbfc736b297f58c5a892e4440e54d294d7004e3a9bbea1b",
+ "sha256:cc44da8e1145637334317feebd728bd869a35285b93cbb4cca2577da7e62db4f"
+ ],
+ "markers": "python_version >= '3.4'",
+ "version": "==1.22"
+ },
+ "wcwidth": {
+ "hashes": [
+ "sha256:3df37372226d6e63e1b1e1eda15c594bca98a22d33a23832a90998faa96bc65e",
+ "sha256:f4ebe71925af7b40a864553f761ed559b43544f8f71746c2d756c7fe788ade7c"
+ ],
+ "version": "==0.1.7"
+ },
+ "wrapt": {
+ "hashes": [
+ "sha256:4aea003270831cceb8a90ff27c4031da6ead7ec1886023b80ce0dfe0adf61533"
+ ],
+ "version": "==1.11.1"
+ }
+ }
+}
diff --git a/python_hadoop/README.md b/python_hadoop/README.md
new file mode 100644
index 0000000..198c949
--- /dev/null
+++ b/python_hadoop/README.md
@@ -0,0 +1,104 @@
+
+Hadoop streaming map/reduce jobs written in python using the mrjob library.
+
+## Development and Testing
+
+System dependencies on Linux (ubuntu/debian):
+
+ sudo apt install -y python3-dev python3-pip python3-wheel libjpeg-dev build-essential
+ pip3 install --user pipenv
+
+On macOS (using Homebrew):
+
+ brew install libjpeg pipenv
+
+You probably need `~/.local/bin` on your `$PATH`.
+
+Fetch all python dependencies with:
+
+ pipenv install --dev
+
+Run the tests with:
+
+ pipenv run pytest
+
+Check test coverage with:
+
+ pipenv run pytest --cov --cov-report html
+ # open ./htmlcov/index.html in a browser
+
+## Troubleshooting
+
+If you get pipenv errors like:
+
+ AttributeError: '_NamespacePath' object has no attribute 'sort'
+
+ ----------------------------------------
+
+ Command "python setup.py egg_info" failed with error code 1 in /1/tmp/pip-install-h7lb6tqz/proto-google-cloud-datastore-v1/
+
+ ☤ ▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉▉ 0/8 — 00:00:03
+ bnewbold@bnewbold-dev$
+ bnewbold@bnewbold-dev$ pipenv install --deploy --dev
+ Installing dependencies from Pipfile.lock (e82980)…
+ An error occurred while installing proto-google-cloud-logging-v2==0.91.3! Will try again.
+ An error occurred while installing gapic-google-cloud-error-reporting-v1beta1==0.15.3! Will try again.
+ An error occurred while installing gapic-google-cloud-datastore-v1==0.15.3! Will try again.
+ An error occurred while installing proto-google-cloud-datastore-v1==0.90.4! Will try again.
+
+Then something has gone horribly wrong with your pip/pipenv/python setup. We
+don't have a good workaround yet.
+
+## Running Python Jobs on Hadoop
+
+The `../please` script automates these steps; you should use that instead.
+
+When running python streaming jobs on the actual hadoop cluster, we need to
+bundle along our python dependencies in a virtual env tarball. Building this
+tarball can be done like:
+
+ export PIPENV_VENV_IN_PROJECT=1
+ pipenv install --deploy
+ tar -czf venv-current.tar.gz -C .venv .
+
+### Extraction Task
+
+An example actually connecting to HBase from a local machine, with thrift
+running on a devbox and GROBID running on a dedicated machine:
+
+ ./extraction_cdx_grobid.py \
+ --hbase-table wbgrp-journal-extract-0-qa \
+ --hbase-host wbgrp-svc263.us.archive.org \
+ --grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
+ tests/files/example.cdx
+
+Running from the cluster (once a ./venv-current.tar.gz tarball exists):
+
+ ./extraction_cdx_grobid.py \
+ --hbase-table wbgrp-journal-extract-0-qa \
+ --hbase-host wbgrp-svc263.us.archive.org \
+ --grobid-uri http://wbgrp-svc096.us.archive.org:8070 \
+ -r hadoop \
+ -c mrjob.conf \
+ --archive venv-current.tar.gz#venv \
+ hdfs:///user/bnewbold/journal_crawl_cdx/citeseerx_crawl_2017.cdx
+
+### Backfill Task
+
+An example actually connecting to HBase from a local machine, with thrift
+running on a devbox:
+
+ ./backfill_hbase_from_cdx.py \
+ --hbase-table wbgrp-journal-extract-0-qa \
+ --hbase-host wbgrp-svc263.us.archive.org \
+ tests/files/example.cdx
+
+Running from the cluster (once a ./venv-current.tar.gz tarball exists):
+
+ ./backfill_hbase_from_cdx.py \
+ --hbase-host wbgrp-svc263.us.archive.org \
+ --hbase-table wbgrp-journal-extract-0-qa \
+ -r hadoop \
+ -c mrjob.conf \
+ --archive venv-current.tar.gz#venv \
+ hdfs:///user/bnewbold/journal_crawl_cdx/citeseerx_crawl_2017.cdx
diff --git a/python_hadoop/backfill_hbase_from_cdx.py b/python_hadoop/backfill_hbase_from_cdx.py
new file mode 100755
index 0000000..6b2ec0b
--- /dev/null
+++ b/python_hadoop/backfill_hbase_from_cdx.py
@@ -0,0 +1,88 @@
+#!/usr/bin/env python3
+"""
+Streaming Hadoop script to import CDX metadata into the HBase fulltext table,
+primarily for URL-agnostic crawl de-duplication. Takes only "fulltext" file
+formats.
+
+Requires:
+- happybase
+- mrjob
+"""
+
+import json
+import happybase
+import mrjob
+from mrjob.job import MRJob
+from common import parse_cdx_line
+
+
class MRCDXBackfillHBase(MRJob):
    """Mapper-only job that backfills parsed CDX metadata into an HBase table.

    Each input line is a raw CDX line. Lines that parse cleanly and whose
    normalized mimetype is in ``self.mime_filter`` are written to HBase under
    their row key; one JSON status record is yielded for every input line.
    """

    # CDX lines in; JSON status out
    INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
    OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol

    def configure_args(self):
        """Register the HBase table/host command line options."""
        super(MRCDXBackfillHBase, self).configure_args()

        self.add_passthru_arg('--hbase-table',
                              type=str,
                              default='wbgrp-journal-extract-0-qa',
                              help='HBase table to backfill into (must exist)')
        self.add_passthru_arg('--hbase-host',
                              type=str,
                              default='localhost',
                              help='HBase thrift API host to connect to')

    def __init__(self, *args, **kwargs):
        super(MRCDXBackfillHBase, self).__init__(*args, **kwargs)
        # Only rows with one of these (normalized) mimetypes are inserted
        self.mime_filter = ['application/pdf']
        # Set lazily by mapper_init(); a pre-set table is left untouched
        self.hb_table = None

    def mapper_init(self):
        """Open the HBase table handle unless one has already been set."""

        if self.hb_table:
            return

        # Bound outside the `try` so the error message below can always use it
        host = self.options.hbase_host
        try:
            # TODO: make these configs accessible from... mrconf.cfg?
            hb_conn = happybase.Connection(host=host, transport="framed",
                                           protocol="compact")
        except Exception as err:
            raise Exception(
                "Couldn't connect to HBase using host: {}".format(host)) from err
        self.hb_table = hb_conn.table(self.options.hbase_table)

    def mapper(self, _, raw_cdx):
        """Parse one CDX line, insert it into HBase, and yield a status dict.

        Yields exactly one ``(key, status)`` pair per input line, where
        ``status['status']`` is one of "invalid", "skip" or "success".
        """

        self.increment_counter('lines', 'total')

        # Header/comment style lines are rejected before attempting a parse
        if raw_cdx.startswith((' ', 'filedesc', '#')):
            self.increment_counter('lines', 'invalid')
            yield _, dict(status="invalid", reason="line prefix")
            return

        info = parse_cdx_line(raw_cdx)
        if info is None:
            self.increment_counter('lines', 'invalid')
            yield _, dict(status="invalid")
            return

        if info['file:mime'] not in self.mime_filter:
            self.increment_counter('lines', 'skip')
            yield _, dict(status="skip", reason="unwanted mimetype")
            return

        # The row key is not a column; the dict-valued columns are stored as
        # JSON strings
        key = info.pop('key')
        info['f:c'] = json.dumps(info['f:c'], sort_keys=True, indent=None)
        info['file:cdx'] = json.dumps(info['file:cdx'],
                                      sort_keys=True, indent=None)

        self.hb_table.put(key, info)
        self.increment_counter('lines', 'success')

        yield _, dict(status="success")


if __name__ == '__main__':  # pragma: no cover
    MRCDXBackfillHBase.run()
diff --git a/python_hadoop/common.py b/python_hadoop/common.py
new file mode 100644
index 0000000..e596b35
--- /dev/null
+++ b/python_hadoop/common.py
@@ -0,0 +1,99 @@
+
+import json
+from datetime import datetime
+
# Canonical mimetypes; any raw value starting with one of these maps to it.
NORMAL_MIME = (
    'application/pdf',
    'application/postscript',
    'text/html',
    'text/xml',
)


def normalize_mime(raw):
    """Map a raw mimetype string onto one of the canonical NORMAL_MIME values.

    Matching is case-insensitive and prefix based, so suffixed variants such
    as "application/pdf+journal" collapse to the base type. Two aliases are
    also folded in: "application/xml*" becomes "text/xml" and
    "application/x-pdf*" becomes "application/pdf".

    Returns the canonical mimetype, or None when the input is empty/None or
    is not recognized.
    """
    if not raw:
        # None or empty string: nothing to normalize
        return None

    raw = raw.strip().lower()
    for norm in NORMAL_MIME:
        if raw.startswith(norm):
            return norm

    # Special cases
    if raw.startswith('application/xml'):
        return 'text/xml'
    if raw.startswith('application/x-pdf'):
        return 'application/pdf'
    return None
+
+
def test_normalize_mime():
    """Check normalize_mime() against a table of raw -> expected values."""
    cases = (
        ("asdf", None),
        ("application/pdf", "application/pdf"),
        ("application/pdf+journal", "application/pdf"),
        ("Application/PDF", "application/pdf"),
        ("application/p", None),
        ("application/xml+stuff", "text/xml"),
        ("application/x-pdf", "application/pdf"),
        ("application/x-html", None),
    )
    for raw, expected in cases:
        result = normalize_mime(raw)
        if expected is None:
            assert result is None
        else:
            assert result == expected
+
+
def parse_cdx_line(raw_cdx):
    """Parse one whitespace-separated CDX line into an HBase-style row dict.

    Fields are read by position: 0 SURT, 1 timestamp, 2 URL, 3 mimetype,
    4 HTTP status, 5 key (digest), 8 compressed size, 9 offset, 10 WARC path.

    Returns None when the line has fewer than 11 fields, the HTTP status is
    not "200", the mimetype is not recognized by normalize_mime(), the key is
    not 32 alphanumeric characters, any numeric field is malformed, the
    timestamp is not a valid YYYYmmddHHMMSS value, or any used field is "-".

    Otherwise returns a dict with:
    - 'key': the digest prefixed with "sha1:"
    - 'file:mime': normalized mimetype
    - 'file:cdx': dict(surt, dt, url, c_size, offset, warc)
    - 'f:c': dict(u, d, f, o, c) with ISO timestamp and WARC file basename
    """

    cdx = raw_cdx.split()
    if len(cdx) < 11:
        return None

    surt = cdx[0]
    dt = cdx[1]
    url = cdx[2]
    mime = normalize_mime(cdx[3])
    http_status = cdx[4]
    key = cdx[5]
    c_size = cdx[8]
    offset = cdx[9]
    warc = cdx[10]

    if not (key.isalnum() and c_size.isdigit() and offset.isdigit()
            and http_status == "200" and len(key) == 32 and dt.isdigit()
            and mime is not None):
        return None

    if '-' in (surt, dt, url, mime, http_status, key, c_size, offset, warc):
        return None

    try:
        # str.isdigit() also accepts characters (eg, superscript digits) that
        # int() rejects; treat those as an invalid line instead of crashing
        c_size = int(c_size)
        offset = int(offset)
        dt_iso = datetime.strptime(dt, "%Y%m%d%H%M%S").isoformat()
    except ValueError:
        return None

    key = "sha1:{}".format(key)

    info = dict(surt=surt, dt=dt, url=url, c_size=c_size,
                offset=offset, warc=warc)

    warc_file = warc.split('/')[-1]

    # 'i' intentionally not set
    heritrix = dict(u=url, d=dt_iso, f=warc_file, o=offset, c=1)
    return {'key': key, 'file:mime': mime, 'file:cdx': info, 'f:c': heritrix}
+
def parse_ungrobided_line(raw_line):
    """Parse one tab-separated "ungrobided" task line into a row dict.

    The line must have exactly four columns: row key, 'f:c' JSON, mimetype,
    and 'file:cdx' JSON.

    Returns None when the column count is wrong, either JSON column fails to
    decode, the mimetype is not recognized by normalize_mime(), the key is
    not 37 characters long with an alphanumeric tail after the first five
    characters, or any parsed value is the placeholder "-".

    Otherwise returns a dict with 'key', 'file:mime', 'file:cdx' and 'f:c'.
    """

    line = raw_line.strip().split("\t")
    if len(line) != 4:
        return None

    key = line[0]
    mime = normalize_mime(line[2])
    try:
        f_c = json.loads(line[1])
        cdx = json.loads(line[3])
    except json.JSONDecodeError:
        return None

    # Note: no debug printing here. This runs inside streaming mappers, where
    # stdout carries job output.
    # NOTE(review): only the length and tail of the key are checked, not the
    # literal "sha1:" prefix -- confirm whether that is intended.
    if not (key[5:].isalnum() and len(key) == 37 and mime is not None):
        return None

    if '-' in (key, mime, f_c, cdx):
        return None

    return {'key': key, 'file:mime': mime, 'file:cdx': cdx, 'f:c': f_c}
diff --git a/python_hadoop/extraction_cdx_grobid.py b/python_hadoop/extraction_cdx_grobid.py
new file mode 100755
index 0000000..88580e1
--- /dev/null
+++ b/python_hadoop/extraction_cdx_grobid.py
@@ -0,0 +1,299 @@
+#!/usr/bin/env python3
+"""
+Streaming Hadoop script to extract metadata and body from fulltext (eg,
+PDF) files using GROBID. Input is a CDX file; results primarily go to HBase,
+with status written to a configurable output stream.
+
+Fulltext files are loaded directly from WARC files in petabox, instead of going
+through the wayback replay.
+
+Requires:
+- happybase
+- mrjob
+- wayback/GWB libraries
+"""
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import os
+import xml
+import json
+import raven
+import struct
+import requests
+import happybase
+import mrjob
+from mrjob.job import MRJob
+import wayback.exception
+from http.client import IncompleteRead
+from wayback.resourcestore import ResourceStore
+from gwb.loader import CDXLoaderFactory
+
+from common import parse_cdx_line
+from grobid2json import teixml2json
+
# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
sentry_client = raven.Client()

# Specific poison-pill rows we should skip
KEY_DENYLIST = (
    'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT',    # "failed to guess ARC header format"
)


class MRExtractCdxGrobid(MRJob):
    """Mapper-only job: CDX line -> WARC fetch -> GROBID -> HBase row.

    For every input CDX line the mapper parses and filters the line, checks
    HBase for an existing GROBID status code, fetches the archived file body,
    posts it to GROBID, converts the TEI-XML response to JSON, and writes the
    resulting columns back to HBase. One JSON status record is yielded per
    input line.
    """

    # CDX lines in; JSON status out
    #HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
    #INPUT_PROTOCOL = mrjob.protocol.RawProtocol
    INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
    OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol

    def configure_args(self):
        """Register HBase, GROBID and WARC related command line options."""
        super(MRExtractCdxGrobid, self).configure_args()

        self.add_passthru_arg('--hbase-table',
                              type=str,
                              default='wbgrp-journal-extract-0-qa',
                              help='HBase table to backfill into (must exist)')
        self.add_passthru_arg('--hbase-host',
                              type=str,
                              default='localhost',
                              help='HBase thrift API host to connect to')
        self.add_passthru_arg('--grobid-uri',
                              type=str,
                              default='http://localhost:8070',
                              help='URI of GROBID API Server')
        self.add_passthru_arg('--warc-uri-prefix',
                              type=str,
                              default='https://archive.org/serve/',
                              help='URI where WARCs can be found')
        self.add_passthru_arg('--force-existing',
                              action="store_true",
                              help='Re-processes (with GROBID) existing lines')

    def __init__(self, *args, **kwargs):
        super(MRExtractCdxGrobid, self).__init__(*args, **kwargs)
        # Set lazily by mapper_init(); a pre-set table is left untouched
        self.hb_table = None
        # An explicit kwarg wins over the PETABOX_WEBDATA_SECRET env variable
        self.petabox_webdata_secret = kwargs.get(
            'petabox_webdata_secret',
            os.environ.get('PETABOX_WEBDATA_SECRET'))
        # Only rows with one of these (normalized) mimetypes are processed
        self.mime_filter = ['application/pdf']
        # ResourceStore is created on first use in fetch_warc_content()
        self.rstore = None

    def grobid_process_fulltext(self, content):
        """POST `content` to GROBID's processFulltextDocument endpoint.

        Returns the `requests` response object unmodified; the caller
        inspects status code and body.
        """
        # NOTE(review): no timeout is passed, so a stalled GROBID worker
        # would presumably block this mapper indefinitely -- confirm whether
        # a timeout is wanted.
        r = requests.post(self.options.grobid_uri + "/api/processFulltextDocument",
                          files={'input': content})
        return r

    def mapper_init(self):
        """Tag the sentry context and open the HBase table if not yet set."""

        if self.hb_table:
            return

        sentry_client.tags_context(dict(hbase_table=self.options.hbase_table))
        # Bound outside the `try` so the error message below can always use it
        host = self.options.hbase_host
        try:
            # TODO: make these configs accessible from... mrconf.cfg?
            hb_conn = happybase.Connection(host=host, transport="framed",
                                           protocol="compact")
        except Exception as err:
            raise Exception(
                "Couldn't connect to HBase using host: {}".format(host)) from err
        self.hb_table = hb_conn.table(self.options.hbase_table)

    def parse_line(self, raw_cdx):
        """Parse and filter one CDX line.

        Returns `(info, None)` for a line that should be processed, or
        `(None, status)` with a status dict explaining why it was rejected.
        """

        if raw_cdx.startswith((' ', 'filedesc', '#')):
            return None, dict(status="invalid", reason="line prefix")

        info = parse_cdx_line(raw_cdx)
        if info is None:
            return None, dict(status="invalid", reason="CDX parse")

        if info['file:mime'] not in self.mime_filter:
            return None, dict(status="skip", reason="mimetype")

        # If warc is not item/file.(w)arc.gz form, skip it
        if len(info['file:cdx']['warc'].split('/')) != 2:
            return None, dict(status="skip", reason="WARC path not petabox item/file")

        return info, None

    def fetch_warc_content(self, warc_path, offset, c_size):
        """Load the raw archived response body for one WARC record.

        Returns `(raw_content, None)` on success, or `(None, status)` with an
        "error" status dict when the record cannot be loaded, was not an
        HTTP 200 capture, or could not be read completely.
        """
        warc_uri = self.options.warc_uri_prefix + warc_path
        if not self.rstore:
            self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
                webdata_secret=self.petabox_webdata_secret,
                download_base_url=self.options.warc_uri_prefix))
        try:
            gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
        except wayback.exception.ResourceUnavailable:
            return None, dict(status="error",
                reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
        except ValueError as ve:
            return None, dict(status="error",
                reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
        except EOFError as eofe:
            return None, dict(status="error",
                reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
        except TypeError as te:
            return None, dict(status="error",
                reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
        # Note: could consider a generic "except Exception" here, as we get so
        # many petabox errors. Do want jobs to fail loud and clear when the
        # whole cluster is down though.

        if gwb_record.get_status()[0] != 200:
            return None, dict(status="error",
                reason="archived HTTP response (WARC) was not 200",
                warc_status=gwb_record.get_status()[0])

        try:
            raw_content = gwb_record.open_raw_content().read()
        except IncompleteRead as ire:
            return None, dict(status="error",
                reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
        return raw_content, None

    def extract(self, info):
        """Fetch the file named by `info`, run GROBID on it, and fill in columns.

        Returns `(info, status)`:
        - `(None, status)` when nothing should be written to HBase (the WARC
          fetch failed or GROBID was unreachable);
        - `(info, status)` with a non-None status when only partial results
          are available (oversize response, non-200 GROBID status, or an
          unparseable/non-TEI response);
        - `(info, None)` on full success.
        """

        # Fetch data from WARCs in petabox
        original_content, status = self.fetch_warc_content(
            info['file:cdx']['warc'],
            info['file:cdx']['offset'],
            info['file:cdx']['c_size'])
        if status:
            return None, status

        info['file:size'] = len(original_content)

        # Submit to GROBID
        try:
            grobid_response = self.grobid_process_fulltext(original_content)
        except requests.exceptions.ConnectionError:
            return None, dict(status="error", reason="connection to GROBID worker")

        info['grobid0:status_code'] = grobid_response.status_code

        # 4 MByte XML size limit; don't record GROBID status on this path
        if len(grobid_response.content) > 4000000:
            info['grobid0:status'] = {'status': 'oversize'}
            return info, dict(status="oversize", reason="TEI response was too large")

        if grobid_response.status_code != 200:
            # response.text is .content decoded as utf-8
            info['grobid0:status'] = dict(status='error', description=grobid_response.text)
            return info, dict(status="error", reason="non-200 GROBID HTTP status",
                extra=grobid_response.text)

        info['grobid0:status'] = {'status': 'partial'}
        info['grobid0:tei_xml'] = grobid_response.content

        # Convert TEI XML to JSON
        try:
            info['grobid0:tei_json'] = teixml2json(info['grobid0:tei_xml'], encumbered=True)
        except xml.etree.ElementTree.ParseError:
            info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML parse error")
            return info, info['grobid0:status']
        except ValueError:
            info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML non-TEI content")
            return info, info['grobid0:status']

        tei_metadata = info['grobid0:tei_json'].copy()
        for k in ('body', 'annex'):
            # Remove fulltext (copyrighted) content
            tei_metadata.pop(k, None)
        info['grobid0:metadata'] = tei_metadata

        # Determine extraction "quality"
        # TODO:

        info['grobid0:quality'] = None
        info['grobid0:status'] = {'status': 'success'}

        return info, None

    @sentry_client.capture_exceptions
    def mapper(self, _, raw_cdx):
        """
        1. parse CDX line
        2. check what is already in hbase
        3. fetch data from wayback
        4. submit to GROBID
        5. convert GROBID response to JSON (and metadata)
        6. determine "quality"
        7. push results to hbase
        """

        self.increment_counter('lines', 'total')

        # Parse line and filter down
        info, status = self.parse_line(raw_cdx)
        if info is None:
            self.increment_counter('lines', status['status'])
            yield _, status
            return
        key = info['key']
        if key in KEY_DENYLIST:
            self.increment_counter('lines', 'denylist')
            yield _, dict(status='denylist', key=key)
            return

        # Note: this may not get "cleared" correctly
        sentry_client.extra_context(dict(row_key=key))

        # Check if we've already processed this line
        oldrow = self.hb_table.row(key,
            columns=[b'f:c', b'file', b'grobid0:status_code'])
        if (oldrow.get(b'grobid0:status_code', None) is not None
                and not self.options.force_existing):
            # This file has already been processed; skip it
            self.increment_counter('lines', 'existing')
            yield _, dict(status="existing", key=key)
            return

        # Do the extraction
        info, status = self.extract(info)
        if info is None:
            self.increment_counter('lines', status['status'])
            status['key'] = key
            yield _, status
            return
        extraction_status = status

        # Decide what to bother inserting back into HBase
        # Particularly: ('f:c', 'file:mime', 'file:size', 'file:cdx')
        grobid_status_code = info.get('grobid0:status_code', None)
        for k in list(info.keys()):
            if k.startswith('grobid0:'):
                # Always write fresh GROBID results. Under --force-existing
                # the old row already holds 'grobid0:status_code', and
                # dropping the new value here would leave it stale.
                continue
            if k.encode('utf-8') in oldrow:
                info.pop(k)

        # Convert fields to binary
        for k in list(info.keys()):
            if info[k] is None:
                info.pop(k)
            elif k in ('f:c', 'file:cdx', 'grobid0:status', 'grobid0:tei_json',
                       'grobid0:metadata'):
                assert isinstance(info[k], dict)
                info[k] = json.dumps(info[k], sort_keys=True, indent=None)
            elif k in ('file:size', 'grobid0:status_code'):
                # encode as int64 in network byte order
                if info[k] != {} and info[k] is not None:
                    info[k] = struct.pack('!q', info[k])

        key = info.pop('key')
        self.hb_table.put(key, info)
        self.increment_counter('lines', 'success')

        if extraction_status is not None:
            yield _, dict(status="partial", key=key,
                grobid_status_code=grobid_status_code,
                reason=extraction_status['reason'])
        else:
            yield _, dict(status="success",
                grobid_status_code=grobid_status_code, key=key,
                extra=extraction_status)


if __name__ == '__main__':  # pragma: no cover
    MRExtractCdxGrobid.run()
diff --git a/python_hadoop/extraction_ungrobided.py b/python_hadoop/extraction_ungrobided.py
new file mode 100755
index 0000000..225e46f
--- /dev/null
+++ b/python_hadoop/extraction_ungrobided.py
@@ -0,0 +1,292 @@
+#!/usr/bin/env python3
+"""
+Variant of extraction_cdx_grobid which takes a partial metadata list as input
+instead of CDX.
+
+This task list is dumped by another Hadoop job which scans over the HBase table
+quickly, which allows this job to skip a (relatively) expensive HBase read
+per-row.
+
+Requires:
+- happybase
+- mrjob
+- wayback/GWB libraries
+"""
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import os
+import xml
+import json
+import raven
+import struct
+import requests
+import happybase
+import mrjob
+from mrjob.job import MRJob
+import wayback.exception
+from http.client import IncompleteRead
+from wayback.resourcestore import ResourceStore
+from gwb.loader import CDXLoaderFactory
+
+from common import parse_ungrobided_line
+from grobid2json import teixml2json
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+# Specific poison-pill rows we should skip
+KEY_DENYLIST = (
+ 'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT', # "failed to guess ARC header format"
+)
+
+class MRExtractUnGrobided(MRJob):
+
+ # "ungrobided" TSV lines in; JSON status out
+ #HADOOP_INPUT_FORMAT = 'org.apache.hadoop.mapred.lib.NLineInputFormat'
+ #INPUT_PROTOCOL = mrjob.protocol.RawProtocol
+ INPUT_PROTOCOL = mrjob.protocol.RawValueProtocol
+ OUTPUT_PROTOCOL = mrjob.protocol.JSONValueProtocol
+
+ def configure_args(self):
+ super(MRExtractUnGrobided, self).configure_args()
+
+ self.add_passthru_arg('--hbase-table',
+ type=str,
+ default='wbgrp-journal-extract-0-qa',
+ help='HBase table to backfill into (must exist)')
+ self.add_passthru_arg('--hbase-host',
+ type=str,
+ default='localhost',
+ help='HBase thrift API host to connect to')
+ self.add_passthru_arg('--grobid-uri',
+ type=str,
+ default='http://localhost:8070',
+ help='URI of GROBID API Server')
+ self.add_passthru_arg('--warc-uri-prefix',
+ type=str,
+ default='https://archive.org/serve/',
+ help='URI where WARCs can be found')
+
+ def __init__(self, *args, **kwargs):
+ super(MRExtractUnGrobided, self).__init__(*args, **kwargs)
+ self.hb_table = None
+ self.petabox_webdata_secret = kwargs.get('petabox_webdata_secret', os.environ.get('PETABOX_WEBDATA_SECRET'))
+ self.mime_filter = ['application/pdf']
+ self.rstore = None
+
+ def grobid_process_fulltext(self, content):
+ r = requests.post(self.options.grobid_uri + "/api/processFulltextDocument",
+ files={'input': content})
+ return r
+
+ def mapper_init(self):
+
+ if self.hb_table:
+ return
+
+ sentry_client.tags_context(dict(hbase_table=self.options.hbase_table))
+ try:
+ host = self.options.hbase_host
+ # TODO: make these configs accessible from... mrconf.cfg?
+ hb_conn = happybase.Connection(host=host, transport="framed",
+ protocol="compact")
+ except Exception:
+ raise Exception("Couldn't connect to HBase using host: {}".format(host))
+ self.hb_table = hb_conn.table(self.options.hbase_table)
+
+ def parse_ungrobided_line(self, raw_line):
+ """Line should be TSV and have non-null fields:
+
+ - key (string)
+ - f:c (string, json)
+ - file:mime (string)
+ - file:cdx (string, json)
+ """
+
+ if (raw_line.startswith(' ') or raw_line.startswith('#')):
+ return None, dict(status="invalid", reason="line prefix", input=raw_line)
+
+ info = parse_ungrobided_line(raw_line)
+ if info is None:
+ return None, dict(status="invalid", reason="ungrobided parse")
+
+ if info['file:mime'] not in self.mime_filter:
+ return None, dict(status="skip", reason="mimetype", mimetype=info['file:mime'])
+
+ # If warc is not item/file.(w)arc.gz form, skip it
+ if len(info['file:cdx']['warc'].split('/')) != 2:
+ return None, dict(status="skip", reason="WARC path not petabox item/file", path=info['file:cdx']['warc'])
+
+ return info, None
+
+ def fetch_warc_content(self, warc_path, offset, c_size):
+ warc_uri = self.options.warc_uri_prefix + warc_path
+ if not self.rstore:
+ self.rstore = ResourceStore(loaderfactory=CDXLoaderFactory(
+ webdata_secret=self.petabox_webdata_secret,
+ download_base_url=self.options.warc_uri_prefix))
+ try:
+ gwb_record = self.rstore.load_resource(warc_uri, offset, c_size)
+ except wayback.exception.ResourceUnavailable:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ResourceUnavailable)")
+ except ValueError as ve:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (ValueError: {})".format(ve))
+ except EOFError as eofe:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (EOFError: {})".format(eofe))
+ except TypeError as te:
+ return None, dict(status="error",
+ reason="failed to load file contents from wayback/petabox (TypeError: {}; likely a bug in wayback python code)".format(te))
+ # Note: could consider a generic "except Exception" here, as we get so
+ # many petabox errors. Do want jobs to fail loud and clear when the
+ # whole cluster is down though.
+
+ if gwb_record.get_status()[0] != 200:
+ return None, dict(status="error",
+ reason="archived HTTP response (WARC) was not 200",
+ warc_status=gwb_record.get_status()[0])
+
+ try:
+ raw_content = gwb_record.open_raw_content().read()
+ except IncompleteRead as ire:
+ return None, dict(status="error",
+ reason="failed to read actual file contents from wayback/petabox (IncompleteRead: {})".format(ire))
+ return raw_content, None
+
+ def extract(self, info):
+
+ # Fetch data from WARCs in petabox
+ original_content, status = self.fetch_warc_content(
+ info['file:cdx']['warc'],
+ info['file:cdx']['offset'],
+ info['file:cdx']['c_size'])
+ if status:
+ return None, status
+
+ info['file:size'] = len(original_content)
+
+ # Submit to GROBID
+ try:
+ grobid_response = self.grobid_process_fulltext(original_content)
+ except requests.exceptions.ConnectionError:
+ return None, dict(status="error", reason="connection to GROBID worker")
+
+ info['grobid0:status_code'] = grobid_response.status_code
+
+ # 4 MByte XML size limit; on this path only an 'oversize' status is recorded, the TEI XML itself is not stored
+ if len(grobid_response.content) > 4000000:
+ info['grobid0:status'] = {'status': 'oversize'}
+ return info, dict(status="oversize", reason="TEI response was too large")
+
+ if grobid_response.status_code != 200:
+ # response.text is .content decoded as utf-8
+ info['grobid0:status'] = dict(status='error', description=grobid_response.text)
+ return info, dict(status="error", reason="non-200 GROBID HTTP status",
+ extra=grobid_response.text)
+
+ info['grobid0:status'] = {'status': 'partial'}
+ info['grobid0:tei_xml'] = grobid_response.content
+
+ # Convert TEI XML to JSON
+ try:
+ info['grobid0:tei_json'] = teixml2json(info['grobid0:tei_xml'], encumbered=True)
+ except xml.etree.ElementTree.ParseError:
+ info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML parse error")
+ return info, info['grobid0:status']
+ except ValueError:
+ info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML non-TEI content")
+ return info, info['grobid0:status']
+
+ tei_metadata = info['grobid0:tei_json'].copy()
+ for k in ('body', 'annex'):
+ # Remove fulltext (copyrighted) content
+ tei_metadata.pop(k, None)
+ info['grobid0:metadata'] = tei_metadata
+
+ # Determine extraction "quality"
+ # TODO:
+
+ info['grobid0:quality'] = None
+ info['grobid0:status'] = {'status': 'success'}
+
+ return info, None
+
+ @sentry_client.capture_exceptions
+ def mapper(self, _, raw_line):
+ """
+ 1. parse filtered line
+ 2. fetch data from wayback
+ 3. submit to GROBID
+ 4. convert GROBID response to JSON (and metadata)
+ 5. determine "quality"
+ 6. push results to hbase
+ """
+
+ self.increment_counter('lines', 'total')
+
+ # Parse line and filter down
+ info, status = self.parse_ungrobided_line(raw_line)
+ if info is None:
+ self.increment_counter('lines', status['status'])
+ yield _, status
+ return
+ key = info['key']
+ if key in KEY_DENYLIST:
+ self.increment_counter('lines', 'denylist')
+ yield _, dict(status='denylist', key=key)
+ return
+
+ # Note: this may not get "cleared" correctly
+ sentry_client.extra_context(dict(row_key=key))
+
+ # Do the extraction
+ info, status = self.extract(info)
+ if info is None:
+ self.increment_counter('lines', status['status'])
+ status['key'] = key
+ yield _, status
+ return
+ extraction_status = status
+
+ # Decide what to bother inserting back into HBase
+ # Basically, don't overwrite backfill fields.
+ grobid_status_code = info.get('grobid0:status_code', None)
+ for k in list(info.keys()):
+ if k in ('f:c', 'file:mime', 'file:cdx'):
+ info.pop(k)
+
+ # Convert fields to binary
+ for k in list(info.keys()):
+ if info[k] is None:
+ info.pop(k)
+ # NOTE: we're not actually sending these f:*, file:* keys...
+ elif k in ('f:c', 'file:cdx', 'grobid0:status', 'grobid0:tei_json',
+ 'grobid0:metadata'):
+ assert type(info[k]) == dict
+ info[k] = json.dumps(info[k], sort_keys=True, indent=None)
+ elif k in ('file:size', 'grobid0:status_code'):
+ # encode as int64 in network byte order
+ if info[k] != {} and info[k] != None:
+ info[k] = struct.pack('!q', info[k])
+
+ key = info.pop('key')
+ self.hb_table.put(key, info)
+ self.increment_counter('lines', 'success')
+
+ if extraction_status is not None:
+ yield _, dict(status="partial", key=key,
+ grobid_status_code=grobid_status_code,
+ reason=extraction_status['reason'])
+ else:
+ yield _, dict(status="success",
+ grobid_status_code=grobid_status_code, key=key,
+ extra=extraction_status)
+
+
+if __name__ == '__main__': # pragma: no cover
+ MRExtractUnGrobided.run()
diff --git a/python_hadoop/grobid2json.py b/python_hadoop/grobid2json.py
new file mode 100755
index 0000000..f3577b0
--- /dev/null
+++ b/python_hadoop/grobid2json.py
@@ -0,0 +1,154 @@
+#!/usr/bin/env python3
+
+"""
+NB: adapted to work as a library for PDF extraction. Will probably be
+re-written eventually to be correct, complete, and robust; this is just a
+first iteration.
+
+This script tries to extract everything from a GROBID TEI XML fulltext dump:
+
+- header metadata
+- affiliations
+- references (with context)
+- abstract
+- fulltext
+- tables, figures, equations
+
+ A flag can be specified to disable copyright-encumbered bits (--no-encumbered):
+
+- abstract
+- fulltext
+- tables, figures, equations
+
+Prints JSON to stdout, errors to stderr
+"""
+
+import io
+import json
+import argparse
+import xml.etree.ElementTree as ET
+
+ns = "http://www.tei-c.org/ns/1.0"
+
+def all_authors(elem):
+ names = []
+ for e in elem.findall('.//{%s}author/{%s}persName' % (ns, ns)):
+ given_name = e.findtext('./{%s}forename' % ns) or None
+ surname = e.findtext('./{%s}surname' % ns) or None
+ full_name = '{} {}'.format(given_name or '', surname or '').strip()
+ names.append(dict(name=full_name, given_name=given_name, surname=surname))
+ return names
+
+
+def journal_info(elem):
+ journal = dict()
+ journal['name'] = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
+ journal['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
+ if journal['publisher'] == '':
+ journal['publisher'] = None
+ journal['issn'] = elem.findtext('.//{%s}idno[@type="ISSN"]' % ns)
+ journal['eissn'] = elem.findtext('.//{%s}idno[@type="eISSN"]' % ns)
+ journal['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
+ journal['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+ return journal
+
+
+def biblio_info(elem):
+ ref = dict()
+ ref['id'] = elem.attrib.get('{http://www.w3.org/XML/1998/namespace}id')
+ # Title stuff is messy in references...
+ ref['title'] = elem.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
+ other_title = elem.findtext('.//{%s}monogr/{%s}title' % (ns, ns))
+ if other_title:
+ if ref['title']:
+ ref['journal'] = other_title
+ else:
+ ref['journal'] = None
+ ref['title'] = other_title
+ ref['authors'] = all_authors(elem)
+ ref['publisher'] = elem.findtext('.//{%s}publicationStmt/{%s}publisher' % (ns, ns))
+ if ref['publisher'] == '':
+ ref['publisher'] = None
+ date = elem.find('.//{%s}date[@type="published"]' % ns)
+ ref['date'] = (date != None) and date.attrib.get('when')
+ ref['volume'] = elem.findtext('.//{%s}biblScope[@unit="volume"]' % ns)
+ ref['issue'] = elem.findtext('.//{%s}biblScope[@unit="issue"]' % ns)
+ el = elem.find('.//{%s}ptr[@target]' % ns)
+ if el is not None:
+ ref['url'] = el.attrib['target']
+ # Hand correction
+ if ref['url'].endswith(".Lastaccessed"):
+ ref['url'] = ref['url'].replace(".Lastaccessed", "")
+ else:
+ ref['url'] = None
+ return ref
+
+
+def teixml2json(content, encumbered=True):
+
+ if type(content) == str:
+ content = io.StringIO(content)
+ elif type(content) == bytes:
+ content = io.BytesIO(content)
+
+ info = dict()
+
+ #print(content)
+ #print(content.getvalue())
+ tree = ET.parse(content)
+ tei = tree.getroot()
+
+ header = tei.find('.//{%s}teiHeader' % ns)
+ if header is None:
+ raise ValueError("XML does not look like TEI format")
+ application_tag = header.findall('.//{%s}appInfo/{%s}application' % (ns, ns))[0]
+ info['grobid_version'] = application_tag.attrib['version']
+ info['grobid_timestamp'] = application_tag.attrib['when']
+ info['title'] = header.findtext('.//{%s}analytic/{%s}title' % (ns, ns))
+ info['authors'] = all_authors(header.find('.//{%s}sourceDesc/{%s}biblStruct' % (ns, ns)))
+ info['journal'] = journal_info(header)
+ date = header.find('.//{%s}date[@type="published"]' % ns)
+ info['date'] = (date != None) and date.attrib.get('when')
+ info['fatcat_release'] = header.findtext('.//{%s}idno[@type="fatcat"]' % ns)
+ info['doi'] = header.findtext('.//{%s}idno[@type="DOI"]' % ns)
+ if info['doi']:
+ info['doi'] = info['doi'].lower()
+
+ refs = []
+ for (i, bs) in enumerate(tei.findall('.//{%s}listBibl/{%s}biblStruct' % (ns, ns))):
+ ref = biblio_info(bs)
+ ref['index'] = i
+ refs.append(ref)
+ info['citations'] = refs
+
+ if encumbered:
+ el = tei.find('.//{%s}profileDesc/{%s}abstract' % (ns, ns))
+ info['abstract'] = (el or None) and " ".join(el.itertext()).strip()
+ el = tei.find('.//{%s}text/{%s}body' % (ns, ns))
+ info['body'] = (el or None) and " ".join(el.itertext()).strip()
+ el = tei.find('.//{%s}back/{%s}div[@type="acknowledgement"]' % (ns, ns))
+ info['acknowledgement'] = (el or None) and " ".join(el.itertext()).strip()
+ el = tei.find('.//{%s}back/{%s}div[@type="annex"]' % (ns, ns))
+ info['annex'] = (el or None) and " ".join(el.itertext()).strip()
+
+ return info
+
+def main(): # pragma no cover
+ parser = argparse.ArgumentParser(
+ description="GROBID TEI XML to JSON",
+ usage="%(prog)s [options] <teifile>...")
+ parser.add_argument("--no-encumbered",
+ action="store_true",
+ help="don't include ambiguously copyright encumbered fields (eg, abstract, body)")
+ parser.add_argument("teifiles", nargs='+')
+
+ args = parser.parse_args()
+
+ for filename in args.teifiles:
+ content = open(filename, 'r')
+ print(json.dumps(
+ teixml2json(content,
+ encumbered=(not args.no_encumbered))))
+
+if __name__=='__main__': # pragma no cover
+ main()
diff --git a/python_hadoop/kafka_grobid_hbase.py b/python_hadoop/kafka_grobid_hbase.py
new file mode 100755
index 0000000..b52c386
--- /dev/null
+++ b/python_hadoop/kafka_grobid_hbase.py
@@ -0,0 +1,200 @@
+#!/usr/bin/env python3
+"""
+Kafka worker that consumes GROBID output from Kafka and pushes into HBase.
+
+Based on the ungrobided Hadoop job code.
+
+TODO: binary conversion in 'grobided' topic? shouldn't be, do that here, as well as all TEI extraction/parsing
+
+Requires:
+- requests
+- pykafka
+"""
+
+# XXX: some broken MRO thing going on in here due to python3 object wrangling
+# in `wayback` library. Means we can't run pylint.
+# pylint: skip-file
+
+import sys
+import xml
+import json
+import raven
+import struct
+import requests
+import argparse
+import happybase
+import pykafka
+
+from common import parse_ungrobided_line
+from grobid2json import teixml2json
+
+# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
+sentry_client = raven.Client()
+
+# Specific poison-pill rows we should skip
+KEY_DENYLIST = (
+ 'sha1:DLCCSMMVTCCIR6LRXHEQLZ4PWO6NG2YT', # "failed to guess ARC header format"
+)
+
+class KafkaGrobidHbaseWorker:
+
+ def __init__(self, kafka_hosts, consume_topic, **kwargs):
+ self.consume_topic = consume_topic
+ self.consumer_group = kwargs.get('consumer_group', 'grobid-hbase-insert2')
+ self.kafka_hosts = kafka_hosts or 'localhost:9092'
+ self.hbase_host = kwargs['hbase_host']
+ self.hbase_table_name = kwargs['hbase_table']
+ self.hb_table = None # connection initialized in run()
+
+ def convert_tei(self, info):
+
+ # Convert TEI XML to JSON
+ try:
+ info['grobid0:tei_json'] = teixml2json(info['grobid0:tei_xml'], encumbered=True)
+ except xml.etree.ElementTree.ParseError:
+ info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML parse error")
+ return info, info['grobid0:status']
+ except ValueError:
+ info['grobid0:status'] = dict(status="fail", reason="GROBID 200 XML non-TEI content")
+ return info, info['grobid0:status']
+
+ tei_metadata = info['grobid0:tei_json'].copy()
+ for k in ('body', 'annex'):
+ # Remove fulltext (copyrighted) content
+ tei_metadata.pop(k, None)
+ info['grobid0:metadata'] = tei_metadata
+ return info, None
+
+ def do_work(self, raw_line):
+ """
+ 1. parse info JSON (with XML inside)
+ 2. do XML -> JSON conversions
+ 3. push to HBase
+
+ Returns: (info, status) tuple; info is None when the key is denylisted
+ """
+
+ # Parse line and filter down
+ info = json.loads(raw_line)
+ key = info['key']
+ if key in KEY_DENYLIST:
+ #self.increment_counter('lines', 'denylist')
+ return None, dict(status='denylist', key=key)
+
+ # Note: this may not get "cleared" correctly
+ sentry_client.extra_context(dict(row_key=key))
+ print("inserting line to HBase: {}".format(key))
+
+ if info.get('grobid0:tei_xml'):
+ # Need to decode 'str' back in to 'bytes' (from JSON serialization)
+ info['grobid0:tei_xml'] = info['grobid0:tei_xml'].encode('utf-8')
+
+ if info.get('grobid0:status') == 200 and info.get('grobid0:tei_xml'):
+ info, status = self.convert_tei(info)
+
+ # Decide what to bother inserting back into HBase
+ # Basically, don't overwrite backfill fields.
+ grobid_status_code = info.get('grobid0:status_code', None)
+ for k in list(info.keys()):
+ if k in ('f:c', 'file:mime', 'file:cdx'):
+ info.pop(k)
+
+ # Convert fields to binary
+ for k in list(info.keys()):
+ if info[k] is None:
+ info.pop(k)
+ # NOTE: we're not actually sending these f:*, file:* keys...
+ elif k in ('f:c', 'file:cdx', 'grobid0:status', 'grobid0:tei_json',
+ 'grobid0:metadata'):
+ assert type(info[k]) == dict
+ info[k] = json.dumps(info[k], sort_keys=True, indent=None)
+ elif k in ('file:size', 'grobid0:status_code'):
+ # encode as int64 in network byte order
+ if info[k] != {} and info[k] != None:
+ info[k] = struct.pack('!q', info[k])
+
+ key = info.pop('key')
+ self.hb_table.put(key, info)
+ #self.increment_counter('lines', 'success')
+
+ return info, dict(status="success",
+ grobid_status_code=grobid_status_code, key=key)
+
+ def run(self):
+
+ # 1. start consumer (in managed/balanced fashion, with consumer group)
+ # 2. for each message, do the work (insert into HBase); on failure,
+ # write the status to stderr
+ # 3. repeat!
+
+ print("Starting grobid-hbase-worker...")
+ try:
+ host = self.hbase_host
+ hb_conn = happybase.Connection(host=host, transport="framed",
+ protocol="compact")
+ except Exception:
+ raise Exception("Couldn't connect to HBase using host: {}".format(host))
+ self.hb_table = hb_conn.table(self.hbase_table_name)
+ print("HBase inserting into {}".format(self.hbase_table_name))
+
+ kafka = pykafka.KafkaClient(hosts=self.kafka_hosts, broker_version="2.0.0")
+ consume_topic = kafka.topics[self.consume_topic]
+
+ sequential_failures = 0
+ consumer = consume_topic.get_balanced_consumer(
+ consumer_group=self.consumer_group,
+ managed=True,
+ auto_commit_enable=True,
+ # needed to avoid MessageSet decode errors
+ fetch_message_max_bytes=4*1024*1024,
+ # LATEST because it is better to miss messages than to waste time re-processing
+ auto_offset_reset=pykafka.common.OffsetType.LATEST,
+ compacted_topic=True)
+ print("Kafka consuming {} in group {}".format(
+ self.consume_topic,
+ self.consumer_group))
+
+ for msg in consumer:
+ #print("got a line! ")
+ grobid_output, status = self.do_work(msg.value.decode('utf-8'))
+ if grobid_output:
+ sequential_failures = 0
+ else:
+ sys.stderr.write("Failed to process GROBID extraction output: {}\n".format(status))
+ sequential_failures += 1
+ if sequential_failures > 20:
+ sys.stderr.write("too many failures in a row, bailing out\n")
+ sys.exit(-1)
+
+
+@sentry_client.capture_exceptions
+def main():
+
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--kafka-hosts',
+ default="localhost:9092",
+ help="list of Kafka brokers (host/port) to use")
+ parser.add_argument('--kafka-env',
+ default="qa",
+ help="eg, 'qa' or 'prod'")
+ parser.add_argument('--consume-topic',
+ default=None,
+ help="Kafka topic to consume from")
+ parser.add_argument('--hbase-table',
+ type=str,
+ default='wbgrp-journal-extract-0-qa',
+ help='HBase table to backfill into (must exist)')
+ parser.add_argument('--hbase-host',
+ type=str,
+ default='localhost',
+ help='HBase thrift API host to connect to')
+ args = parser.parse_args()
+
+ if args.consume_topic is None:
+ args.consume_topic = "sandcrawler-{}.grobid-output".format(args.kafka_env)
+
+ worker = KafkaGrobidHbaseWorker(**args.__dict__)
+ worker.run()
+
+if __name__ == '__main__': # pragma: no cover
+ main()
diff --git a/python_hadoop/mrjob.conf b/python_hadoop/mrjob.conf
new file mode 100644
index 0000000..6f36196
--- /dev/null
+++ b/python_hadoop/mrjob.conf
@@ -0,0 +1,16 @@
+runners:
+ local:
+ upload_files:
+ - common.py
+ - grobid2json.py
+ setup:
+ - export PYTHONPATH=$PYTHONPATH:venv/lib/python3.5/site-packages/
+ hadoop:
+ no_output: true
+ upload_files:
+ - common.py
+ - grobid2json.py
+ setup:
+ - export PYTHONPATH=$PYTHONPATH:venv/lib/python3.5/site-packages/
+ cmdenv:
+ SENTRY_DSN: https://6ab6ad080d034280b863f294e07cc5c6:414ebf0b68634f669d2dc00d7c935699@books-sentry.us.archive.org/9
diff --git a/python_hadoop/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml b/python_hadoop/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml
new file mode 100644
index 0000000..dbc8be5
--- /dev/null
+++ b/python_hadoop/tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml
@@ -0,0 +1,2004 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0"
+xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /srv/grobid/grobid-0.5.1/grobid-home/schemas/xsd/Grobid.xsd"
+ xmlns:xlink="http://www.w3.org/1999/xlink">
+ <teiHeader xml:lang="en">
+ <encodingDesc>
+ <appInfo>
+ <application version="0.5.1-SNAPSHOT" ident="GROBID" when="2018-04-02T00:31+0000">
+ <ref target="https://github.com/kermitt2/grobid">GROBID - A machine learning software for extracting information from scholarly documents</ref>
+ </application>
+ </appInfo>
+ </encodingDesc>
+ <fileDesc>
+ <titleStmt>
+ <title level="a" type="main">DYNAMICS OF RAILWAY FREIGHT VEHICLES</title>
+ </titleStmt>
+ <publicationStmt>
+ <publisher/>
+ <availability status="unknown"><licence/></availability>
+ <date type="published" when="2015">2015</date>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <analytic>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Original Citation Iwnicki</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Orlova</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Hecht</surname></persName>
+ </author>
+ <author>
+ <affiliation key="aff0">
+ <orgName type="institution">University of Huddersfield Repository</orgName>
+ </affiliation>
+ </author>
+ <title level="a" type="main">DYNAMICS OF RAILWAY FREIGHT VEHICLES</title>
+ </analytic>
+ <monogr>
+ <title level="m">Dynamics of railway freight vehicles. Vehicle System Dynamics. pp. 1­39. ISSN 0042­3114</title>
+ <imprint>
+ <date type="published" when="2015">2015</date>
+ </imprint>
+ </monogr>
+ <note>Dynamics of railway freight vehicles</note>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+ <profileDesc>
+ <textClass>
+ <keywords>
+ <term>Freight wagon</term>
+ <term>Vehicle dynamics</term>
+ <term>Computer simulation</term>
+ <term>Rail Freight</term>
+ <term>Running Gear Design</term>
+ <term>Freight Bogies</term>
+ </keywords>
+ </textClass>
+ <abstract>
+ <p>This paper summarises the historical development of railway freight vehicles and how vehicle designers have tackled the difficult challenges of producing running gear which can accommodate the very high tare to laden mass of typical freight wagons whilst maintaining stable running at the maximum required speed and good curving performance. The most common current freight bogies are described in detail and recent improvements in techniques used to simulate the dynamic behaviour of railway vehicles are summarised and examples of how these have been used to improve freight vehicle dynamic behaviour are included. A number of recent developments and innovative components and sub systems are outlined and finally two new developments are presented in more detail: the LEILA bogie and the SUSTRAIL bogie.</p>
+ </abstract>
+ </profileDesc>
+ </teiHeader>
+ <text xml:lang="en">
+ <body>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>DYNAMICS OF RAILWAY FREIGHT VEHICLES</head><p>Iwnicki S.D. 1 , Stichel S. <ref type="bibr" target="#b1">2</ref> , Orlova A. 3 , Hecht M. <ref type="bibr" target="#b3">4</ref> </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">Introduction</head><p>From their inception railways have been predominant in the carriage of bulk goods and railway wagons have been designed to allow this to be effected efficiently on different types of railway infrastructure. In more recent times with changes in industrial needs and competition from road and air transport railways have carried an ever declining share of freight. Although there is some evidence in some countries that this trend has started to change recently due to road congestion there is still not yet a widespread evidence of a major modal shift from road to rail which politicians have indicated is desirable. For example the European Transport White paper 2011 <ref type="bibr" target="#b0">[1]</ref> sets a target for modal shift of 30% by 2030 and 50% by 2050 from road freight to other modes such as rail or waterborne transport for distances over 300 km.</p><p>The barriers to this increased modal shift from road to rail seem to be largely due to the requirements from modern shippers for shorter end-to-end times but even more the demand is for high reliability of service and for additional features such as tracking and tracing of shipments, security and temperature control. As Hecht <ref type="bibr" target="#b1">[2]</ref> points out the lower speeds for rail freight compared with passenger services are not mainly related to lower vehicle speed capability but are more due to the fact that freight trains often travel on lower speed lines or are held for passenger traffic to pass and due to complex and lengthy shunting and handling operations and motive power and crew changes.</p><p>Nevertheless if freight vehicle speeds and acceleration and braking capabilities could allow them to be fully integrated with passenger traffic this would bring a step change in end to end freight train speeds as well as overall system capacity. 
A key factor in obtaining this increased speed is to ensure that the dynamic performance of freight vehicles can allow safe and reliable operation on track with different levels of irregularities and support conditions. Running gear has evolved with the experience of operation on different railways and more recently the use of computer simulation tools and several standardised designs are now ubiquitous. Several research projects and teams have recently been trying to advance from this position using innovative designs adapted from passenger vehicles or using other novel techniques. The use of computer simulations is now established for design of running gear and is also becoming accepted as part of the vehicle acceptance processes in many countries.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2">Early developments of freight wagons</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1">Background</head><p>Designers of freight vehicle running gear face many challenges but not least of these is the fact that the ratio of the laden to tare mass of a freight vehicle can be as much as 5:1 compared with a more manageable 1.5:1 for typical passenger vehicles. This effectively means that the suspension system has to be designed for two different vehicles (and every stage in between). A number of clever designs have evolved over the years and the most successful of these are now summarised.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2">UIC double link</head><p>Freight wagons with link type suspensions have existed for more than 100 years, as can be seen in <ref type="figure" target="#fig_0">figure 1</ref>, and the link suspension is probably still the most common suspension type for two axle freight wagons in Europe today. As early as 1890 the principle of the link suspension was defined as a standard. A review of freight wagons with link suspension can be found in <ref type="bibr" target="#b2">[3]</ref>. After World War II the UIC double link suspension was defined as a standard <ref type="bibr" target="#b4">[5]</ref>. In the beginning of the 1980s a number of improvements were made. The axle load was increased to 22.5 tonnes and the parabolic leaf spring was introduced as standard component <ref type="bibr" target="#b5">[6]</ref>, <ref type="bibr" target="#b6">[7]</ref>. The UIC double link suspension in figure 2 mainly consists of three parts: Leaf springs, links and axle guards. The vehicle is connected to the parabolic or leaf spring by double links. The leaf spring rests on the axle box. This arrangement allows the axle box to move in both the longitudinal and lateral direction relative to the wagon body. The axle guard restricts the horizontal motion of the axle box. The principle of the suspension is that of a pendulum. In the longitudinal direction the suspension links are inclined, whereas in the lateral direction they are in a vertical plane when the vehicle body is in nominal position <ref type="bibr" target="#b0">[1]</ref>, <ref type="bibr" target="#b7">[8]</ref>, <ref type="bibr" target="#b8">[9]</ref>, <ref type="bibr" target="#b9">[10]</ref>. The characteristics of the double-link suspension are quite complex. The main components are shown in <ref type="figure">Figure 3</ref>. 
One of the main advantages of the link running gear is that it is simple, robust and cheap and also takes up little space in both lateral and vertical directions. Both stiffness and damping are provided by one system and are load dependent. The quasistatic curving performance of the single axle running gear with link suspension is good. For a typical two-axle freight wagon with a wheelbase of 9m on dry rails good steering performance down to 300 m curve radius can be achieved <ref type="bibr" target="#b9">[10]</ref>.</p><p>The running behaviour of two-axle freight wagons with link suspension can be rather poor mainly due to vehicle hunting. The amount of damping provided in the horizontal plane is often not sufficient. Additionally the characteristics of the suspension change during the life of the vehicle, due to suspension wear, and with the running conditions <ref type="bibr" target="#b9">[10]</ref>. The link suspension takes quite a lot of longitudinal space and is a poor isolator for sound and vibration.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.3">Link suspension bogies</head><p>The leaf spring and link suspension of the single-axle running gear has also been used on bogies since about 1925 <ref type="bibr" target="#b0">[1]</ref>. More recently it has been standardised with for example bogie type 931 (figure 4), developed in the 1950s by Deutsche Bahn with a wheelbase of 2000 mm and a wheel diameter of 1000 mm. This bogie was developed to run at 100 km/h with an axle load of 20 t and was the first bogie standardised by UIC <ref type="bibr" target="#b5">[6]</ref>, <ref type="bibr" target="#b6">[7]</ref>. In the beginning of the 1980s DB bogie type 665 was introduced with new features like parabolic leaf springs, 22.5 t permissible axle load and shorter links as shown in figure 5 <ref type="bibr" target="#b6">[7]</ref>. The bogie frame is a welded steel design but in some places forged components are used. The frame is connected to parabolic or trapezoidal leaf springs, that rest on the axlebox, being connected by swing links. Nominally the suspension links are positioned in a longitudinal vertical plane and inclined in this plane. During vehicle operation the links swing in that plane and also laterally <ref type="bibr" target="#b0">[1]</ref>, <ref type="bibr" target="#b5">[6]</ref>, <ref type="bibr" target="#b6">[7]</ref>, <ref type="bibr" target="#b10">[11]</ref>. A spherical centre-pivot and two side bearers connect the bogie frame and the wagon body. The side bearers can be either rigid or vertically suspended and have three functions:</p><p> to act as static support for the carbody.  to act as roll stiffness.  to provide friction damping between carbody and bogie The quasistatic curving performance of a bogie with link suspension is generally very good due to:</p><p> the short wheelset distance in the bogie of 1.8 m.  the soft longitudinal primary suspension.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.4">The Y25 Standard Bogie</head><p>Most railway vehicles have bogies or trucks which allow longer vehicles supported on two bogies while still keeping attack angles between wheels and rail in curves to reasonable levels. This arrangement also allows two stages of suspension with the 'primary' suspension between wheelset and bogie and secondary suspension between bogie and coach or wagon body. The primary suspension can isolate the bogie from short wavelength irregularities while the secondary suspension deals with the longer wavelength, lower frequency excitations.</p><p>As previously mentioned, a specific challenge for designers of freight vehicle running gear is the large difference between tare and laden vehicle mass. In the Y25 bogie progressive damping with vertical load is effected by the use of 'Lenoir links' which take part of the vertical load through an angled link and a pusher onto a vertical friction surface. This gives a level of damping which is broadly proportional to the vehicle mass. The Y25 bogie design originated in France in 1948 and was standardised by the ORE steering committee in 1967. It is shown in <ref type="figure">figure 6</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Figure 6: A Y25 type bogie</head><p>The design has been hugely successful and Y25 type bogies are the most predominant freight bogie in Europe.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.5">'three-piece' Freight Bogies</head><p>The three-piece bogies were first developed in 1930s and seemed to originate simultaneously in the USA (Barber bogie) and the Soviet Union (Hanin bogie). Now the three-piece bogie and its more sophisticated descendents are the most common suspension for freight wagons across North and South Americas, CIS countries, China, Africa, India and Australia. Maximum axle loads range between 7 and 36 t. The most common standards for three-piece bogies are AAR <ref type="bibr" target="#b12">[13]</ref> for 1435 mm gauge and GOST <ref type="bibr" target="#b13">[14]</ref> for 1520 mm gauge. A review of three-piece bogies can be found in <ref type="bibr" target="#b14">[15]</ref>.</p><p>The Russian model 18-100 bogie shown in <ref type="figure">figure 7</ref> is a good example of an early type of three-piece bogie. The term 'three-piece' refers to the design of the bogie frame which consists of three interconnected parts: two side frames and one bolster. The frame parts are usually cast.</p><p>The bogie is equipped with central suspension between the side frames and the bolster that consists of a set of springs and wedge friction dampers working in vertical and lateral direction and keeping the frame square. The side frames with their flat surfaces rest on the axle-boxes (or bearing adapters). The size of the opening in the side frame provides clearances in longitudinal and lateral direction within which the axle-box moves resisted by dry friction forces. The car body rests on the flat center bowl, its roll motion relative to the bolster is limited by side bearers which are usually stiff vertical stops including clearance when the wagon body is in the central position. The three-piece bogie is a very robust design with the advantage of being low cost in production, operation and repair. 
The following items are considered as disadvantages of traditional three-piece bogie and attempts have been made to address these in its further developments <ref type="bibr" target="#b14">[15]</ref>, <ref type="bibr" target="#b15">[16]</ref>, <ref type="bibr" target="#b16">[17]</ref>:</p><p> Limited critical speed of the empty wagon )with sway oscillation of car body being the major loss of stability mode);  Wheel flange contact in curves produced by warping between side frames and bolster;  Side frames adding to the unsprung mass and thus increasing track impact on short wavelength irregularities;  Deterioration of ride performance with wear of friction wedges and other friction surfaces.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3">Computer simulation</head><p>Computer simulation of freight vehicles is not at all as common as for passenger vehicles. Since many of the European freight vehicles are standardized very little new development has been carried out and the manufacturers do in general not perform a simulation analysis of the running behaviour of freight wagon. However, in several research groups at universities and research institutes and at some consulting companies computer simulation of freight vehicles is now performed.</p><p>Since manufacturers do not usually build simulation models of freight vehicles themselves one of the main challenges in modelling a freight wagon is to obtain all the input parameters required. Another aspect is that most suspension elements are strongly non-linear and in many cases even mathematically non-smooth. This makes it very difficult to build up simulation models that provide good results compared to measurement results. Some of the phenomena observed during simulation of freight vehicles will be discussed below.</p><p>Further, as described in Section 3.1, the characteristics of the suspension elements can vary during operation due to wear or environmental effects such as for example surface contamination changing the friction coefficient in sliding surfaces.</p><p>The main purpose of simulation studies of freight vehicles is very often a stability analysis (see Section 3.2) or an investigation of the curving behaviour of the freight wagon (see Section 3.3). Since the axle loads of freight wagons are usually high, the investigation of wheel or rail wear and rolling contact fatigue is often the primary reason for a simulation study in curves.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1">Suspension components</head><p>The suspension in most freight vehicles relies on friction damping. Friction elements are low cost, require little maintenance and are usually load dependent. This means that the level of friction damping changes with axle load, an important feature in freight wagons due to the high tare to laden ratio already mentioned. Surveys of modelling of friction components in freight wagon can be found for example in <ref type="bibr" target="#b17">[18]</ref>- <ref type="bibr" target="#b21">[22]</ref>. Papers <ref type="bibr" target="#b17">[18]</ref> and <ref type="bibr" target="#b18">[19]</ref> are general reviews of rail vehicle suspension components, while <ref type="bibr" target="#b19">[20]</ref> is focused on freight vehicles and also discusses issues such as stability and curving of freight vehicles. Papers <ref type="bibr" target="#b20">[21]</ref> and <ref type="bibr" target="#b21">[22]</ref> are focussed on modelling friction wedges of three-piece bogies. Also in the proceedings from the Euromech 500 colloquium <ref type="bibr" target="#b22">[23]</ref> many valuable contributions on the topic of non-smooth suspension elements can be found. Various arrangements of suspension elements to simulate vehicle suspensions are documented in <ref type="bibr" target="#b23">[24]</ref>, <ref type="bibr" target="#b24">[25]</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.1">Friction damping</head><p>In most freight vehicle simulation models friction is modelled as dry Coulomb friction, where the friction force is proportional to the normal load. The friction coefficient is assumed to be constant, see force-deflection curve in <ref type="figure">figure 8</ref>, left. The disadvantage of the Coulomb model is that it is non-smooth, i.e. multi-valued and non-differentiable. Another way to model friction is with a linear spring in series with a friction slider as in <ref type="figure" target="#fig_8">figure 9</ref> with the resulting force-displacement characteristic in <ref type="figure">figure 8</ref>, right. Since most friction damper arrangements have a finite flexibility, such models could also be regarded as more realistic. Note, however that the model with a spring in series is still non-smooth. To avoid the difficulties mentioned above regularization methods are often applied, see for example <ref type="bibr" target="#b25">[26]</ref>, <ref type="bibr" target="#b26">[27]</ref> and <ref type="bibr" target="#b27">[28]</ref>. Piotrowski developed a non-smooth rheological model <ref type="bibr" target="#b28">[29]</ref>, <ref type="bibr" target="#b29">[30]</ref>, which employs the notion of the differential succession involving a contingent derivative of the non- smooth, multi-valued characteristics of Coulomb friction. Tan and Rogers <ref type="bibr" target="#b30">[31]</ref> proposed equivalent viscous damping models to avoid the numerical problems of Coulomb friction. They claim that this substitution works very well for cases where sliding motions predominate.</p><p>In many running gear arrangements two-dimensional friction elements are needed, e.g. in the Y25 and in the three-piece bogie. In these designs motions in two directions tangential to the friction surfaces are possible. Two-dimensional Coulomb friction models can be found e.g. 
in <ref type="bibr" target="#b31">[32]</ref>, <ref type="bibr" target="#b32">[33]</ref>.</p><p>Another phenomenon that is important to take into account is stochastic excitations that smooth the dry friction damping. Also mid frequency excitation generated in the wheel rail contact - often called dither - can smoothen dry friction and therefore have a significant influence on the simulation results, see for example <ref type="bibr" target="#b29">[30]</ref>, <ref type="bibr" target="#b32">[33]</ref>.</p><p>True and Asmund <ref type="bibr" target="#b32">[33]</ref> investigated the effects of dry friction in the suspension of a simple freight vehicle. They used a relatively simple model of dry friction and found that the stable behaviour for the system with friction exhibited a laterally oscillating motion which makes the system sensitive to external periodic forcing. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.2">Wagons with link suspension</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.2.1">Basic model of leaf spring and link suspension</head><p>Leaf springs are often used as vertical suspension. In multibody simulation models they are usually regarded as rigid in both the longitudinal and lateral directions. For dynamic displacements around a static equilibrium position leaf springs are characterized by a relatively high stiffness for small displacements and a significantly lower stiffness for larger displacement, (figure 10). Leaf springs are described in the ORE reports <ref type="bibr" target="#b33">[34]</ref>, <ref type="bibr" target="#b34">[35]</ref>. Since link suspensions show very similar characteristics they are often modelled in a similar way to leaf springs, at least for the lateral link behaviour. The initial higher stiffness k 1 in leaf springs is caused by friction, i.e. the leaves of a leaf spring stick together for small displacements and start to slide on each other for larger displacements. In the same way the link rolls in the end bearing as long as there is no sliding in the contact area. The lower stiffness k 2 is the value for sliding in the leaf spring or the so called pendulum stiffness of a link. The force F d determines the amount of damping in the hysteresis. A commonly used model to represent the two different stiffness values with the hysteresis is to use a linear spring and a friction element in series, in parallel with another linear spring, as shown in <ref type="figure" target="#fig_0">figure 11</ref>. It should be taken into account that the characteristics of leaf springs vary due to wear in running or deterioration or lubrication state.</p><p>The three parameters in the model described above can be derived from measurements. This model, however, is simplified since the shape of the hysteresis curve is usually rounded as shown in <ref type="figure" target="#fig_0">figure 10</ref>. 
Measurement results and more detailed descriptions of link suspensions can be found in <ref type="bibr" target="#b33">[34]</ref>- <ref type="bibr" target="#b47">[48]</ref>. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.2.2">Advanced simulation models</head><p>For lateral displacements of a double-link all four joints are assumed to start to slide at the same time; therefore the model in <ref type="figure" target="#fig_0">figure 11</ref> is sufficient. In the longitudinal direction, however, it is more likely that the joints start to slide at different displacements as shown e.g. by Piotrowski <ref type="bibr" target="#b28">[29]</ref>. He uses a set of four sliders and spring elements with different breakout forces in parallel to describe these characteristics. Also in a model used by Stiepel several elements in parallel are used <ref type="bibr" target="#b43">[44]</ref>.</p><p>To give a better representation of the rounded shape of the hysteresis curves, Fancher developed a model for truck leaf springs <ref type="bibr" target="#b44">[45]</ref>, <ref type="bibr" target="#b45">[46]</ref> using exponential expressions. Jönsson <ref type="bibr" target="#b41">[42]</ref> used a similar approach, where the total force over the suspension component is separated into piece-wise elastic and friction forces. The model is used for both leaf springs and double-links.</p><p>Another possibility to describe hysteresis with rounded shape for link suspensions is to use rolling contact theory, which has been proposed by <ref type="bibr">Piotrowski [33]</ref>. Based on the slip velocity the creepage in the contact is calculated.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.3">Modelling the three-piece bogie</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.3.1">Models of the central suspension</head><p>Most of the research in modelling three-piece bogies, such as <ref type="bibr" target="#b20">[21]</ref>, <ref type="bibr" target="#b21">[22]</ref>, is focussed on the central suspension element of the three-piece truck that provides damping with friction wedges. Early models of friction wedge suspensions recognized only vertical load-dependent friction force, later models included two-dimensional friction in the vertical and lateral directions <ref type="bibr" target="#b45">[46]</ref>, <ref type="bibr" target="#b49">[50]</ref>.</p><p>The first approach to account for possible angular and longitudinal displacements of bolster relative to the side frames is to introduce warping and longitudinal nonlinear resistance characteristics into the model, as it is done in <ref type="bibr" target="#b14">[15]</ref>, <ref type="bibr" target="#b16">[17]</ref>. In such case the wedges are not modelled as separate bodies, but the equivalent force against displacement characteristics are introduced accounting for wedge parameters, such as inclination angle, width of the vertical surface, width of the inclined surface, friction coefficients on inclined and vertical surfaces, etc.</p><p>The second approach to account for all possible degrees of freedom between side frame and bolster is to introduce multiple contact points mapped along the edges of the wedge with two-dimensional friction force elements in each of them. Such an approach was used by Ballew et al <ref type="bibr" target="#b45">[46]</ref>, it is implemented in simulation tools such as VAMPIRE <ref type="bibr" target="#b51">[52]</ref>, and the Universal Mechanism software <ref type="bibr" target="#b51">[52]</ref>. 
Numerous contact elements require an efficient numerical simulation algorithm to be implemented into the software that provides fast solution to resulting stiff system of equations, such as the one developed by Pogorelov <ref type="bibr" target="#b56">[57]</ref>. The wedges are treated as massless. Contact type models allow the study of such complicated phenomenon as uneven distribution of contact forces over the wedge surfaces, implementation of resilient pads on wedge surfaces, jamming and wedging <ref type="bibr" target="#b53">[54]</ref>. In paper <ref type="bibr" target="#b55">[56]</ref> the authors included the mass of the wedge into consideration to study its dynamic properties.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.3.2">Models of the axle to side frame interaction</head><p>In the first approach similar to friction wedges the axle to side frame interaction can be described by nonlinear equivalent characteristics as in <ref type="bibr" target="#b14">[15]</ref>, <ref type="bibr" target="#b16">[17]</ref>. The dry friction interaction between the axle box crown and the side frame pedestal is modelled by two dimensional dry friction element in parallel with another nonlinear element that describes bumpstops in longitudinal and lateral dimension. A typical characteristic of the bumpstop element is presented in <ref type="figure" target="#fig_0">figure 12</ref>. To improve numerical integration the transition from clearance to bumpstop is often smoothed.</p><p>If the interaction between the crown and pedestal is a flat surface, then its width can result in roll stiffness that is produced by gravity. Such stiffness can be introduced into the model depending on the axle load. The second approach is to introduce multiple contact points on the edges of the crown with two-dimensional friction elements in them. The bumpstops are then also the contact elements between the axle box or adapter and the stops in the side frame jaws. Such approach is used in <ref type="bibr" target="#b56">[57]</ref> as well as in Universal Mechanism software <ref type="bibr" target="#b51">[52]</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.1.3.3">Models of the centre bowl and side bearers</head><p>The same approaches can be applied to models of the centre bowl to centre plate interaction and at the side bearers.</p><p>In the first approach, see <ref type="bibr" target="#b14">[15]</ref>, <ref type="bibr" target="#b16">[17]</ref>, centre plate to centre bowl interaction works simultaneously as one dimensional yaw friction and nonlinear roll and pitch torque with soft characteristics as shown in <ref type="figure" target="#fig_0">figure 13</ref>. Knowing the clearance in the side bearers the nonlinear roll characteristic can be linearized. The second approach is to introduce multiple contact points on the edges of the centre plate with two-dimensional friction elements in them. The interaction with the centre bowl rim is then also the contact elements. Such an approach is used in <ref type="bibr" target="#b56">[57]</ref> as well as in Universal Mechanism software <ref type="bibr" target="#b51">[52]</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.2">Stability</head><p>Freight vehicles in most cases operate at much lower speeds than passenger vehicles. Typical running speeds are at around 100 km/h. This suggests that stability investigations are not as important as for faster passenger vehicles. On the other hand freight vehicles often are much less damped than passenger vehicles and stability investigations are therefore necessary. Several of the wagon types introduced above can - in unfavourable running conditions - show significant hunting behaviour at speeds as low as 70 km/h.</p><p>In a bogie vehicle basically three types of hunting motion can arise:</p><p> Wheelset hunting where one wheelset performs the hunting motion.</p><formula xml:id="formula_0"> M 0  M   c Mg  </formula><p> Bogie hunting where a whole bogie is taking over the hunting motion.  Carbody hunting where the carbody performs a yaw motion and the two bogies mainly follow the carbody with lateral motions, i.e. the whole vehicle takes over the hunting motion.</p><p>Carbody hunting is often a type of resonance phenomenon, where the Klingel hunting frequency given mainly by vehicle speed and conicity in the contact coincides with the yaw eigenfrequency of the carbody.</p><p>Hunting motion with a non-zero limit cycle depends on the wheel-rail geometry, the suspension and the masses and inertias of the vehicle. Since the mass and inertia, and in most cases the suspension stiffness and damping of the freight wagon will significantly change with load, the type of hunting motion observed usually differs between an empty and a loaded wagon. Since the stiffness values between axlebox and bogie frame (in a bogie vehicle) are lower in an unloaded vehicle, the risk for wheelset or bogie hunting is higher. In loaded vehicles, vehicle hunting can often be observed. 
Since the frequency of wheelset hunting is usually low (typically between 1 and 2 Hz) the wheel rail forces induced are relatively low and in most cases below the limit values stipulated in standards. Therefore, the vehicle design in reality allows for the carbody instability to happen in some conditions. Otherwise the suspension needs to be so stiff that the curving performance would suffer, and the amount of wear and RCF would increase significantly. The risk of carbody hunting can vary with the type of load since this can influence the yaw eigenfrequency of the carbody.</p><p>Due to the significant inherent non-linearity and non-smoothness of the suspension elements linearization of the models is usually not realistic. It is therefore necessary to perform time stepping integration with the full non-linear model. The task is in general to find the non-linear critical speed v B of the wagon as can be seen in the generic bifurcation diagram in figure 14.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Figure 14: Generic bifurcation diagram</head><p>In complex models it is very difficult to find the exact critical speed, for example with a path following method <ref type="bibr" target="#b57">[58]</ref>. Therefore other engineering methods are used. One possibility that has been suggested e.g. by <ref type="bibr">Polach [59]</ref> is to excite the vehicle with an initial disturbance that can either be deterministic or stochastic. After the initial disturbance the vehicle is run on ideal smooth track. If the oscillation vanishes the vehicle is regarded as stable. The simulations have to be repeated with increasing speed until the oscillations do not disappear. In that case the non-linear critical speed v b ( <ref type="figure" target="#fig_0">figure 15</ref>) is reached. A risk with this method is that the initial disturbance is not high enough to initiate a limit cycle oscillation and that the critical speed detected is higher than the real non-linear critical speed.</p><p>Another method to detect the non-linear critical speed is start the simulations at a very high speed to be sure that the vehicle has reached the non-zero attractor (limit cycle). Then the speed is continuously reduced until the limit cycle behaviour disappears. Polach also describes this method. It has been used for example by Boronenko et al <ref type="bibr" target="#b14">[15]</ref> to tune the suspension of three-piece bogies.</p><p>A similar method, shown in <ref type="figure" target="#fig_0">figure 15</ref>, is suggested in <ref type="bibr" target="#b59">[60]</ref> to determine the so-called non-linear critical speed. The difference to the method introduced above is that the speed is not reduced continuously but in discrete steps as suggested by True <ref type="bibr" target="#b98">[98]</ref>. <ref type="figure" target="#fig_0">Figure 16</ref> shows the bifurcation diagram for a loaded two-axle vehicle calculated with this method. 
It can be observed that only the stable branches of the bifurcation diagram can be determined, not the unstable part. The zero solution is also possible at least up to a speed of 120 km/h (bold solid line). This was simulated using the procedure above, starting from low speed and increasing the speed stepwise. Hoffman also investigated the stability of a two-axle wagon with link suspension <ref type="bibr" target="#b42">[43]</ref>, <ref type="bibr" target="#b60">[61]</ref>. He uses the link model developed by Piotrowski <ref type="bibr" target="#b28">[29]</ref>. The leaf springs model is based on Fancher et al <ref type="bibr" target="#b45">[46]</ref>. Gialleonardo et al <ref type="bibr" target="#b61">[62]</ref> extended this type of stability analysis for a two-axle wagon with link suspension on curved track. As can be seen in <ref type="figure" target="#fig_0">figure 18</ref>. the leading wheelset (y lw )</p><p>shows much smaller oscillation amplitudes than the trailing wheelset (y tw ) and the carbody. This is because the outer wheel of the leading wheelset experiences flange contact. In general the results show the presence of large periodic oscillations in narrow curves at commercial operating speeds. It is also shown in the paper that the coupling forces between wagon assemblies significantly reduce the oscillation amplitudes. Zhai et al <ref type="bibr" target="#b62">[63]</ref> extended the stability analysis for a freight wagon with three-piece bogies to also include a visoelastic track structure. The stability analysis is performed according to the methodology suggested by Polach, which is explained above. The authors found that a lower critical hunting speed is obtained on elastic track compared with the rigid track case. The difference in the critical hunting speeds between the elastic track base and the rigid track base is 4.4% for the loaded freight car.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3">Curving</head><p>As indicated above simulations of the running behaviour of freight wagons in curves are often performed to investigate the risk of wheel wear and Rolling Contact Fatigue (RCF).</p><p>For passenger vehicles curving simulations are often performed on ideal track, i.e. the stochastic track irregularities are neglected. Authors are in this case interested in the quasistatic behaviour of the vehicle, i.e. the mean wheelset attack angles or the mean energy dissipation in the contact points. For freight vehicles with non-linear and non- smooth suspension this can lead to significant mistakes as shown in the example from Jönsson <ref type="bibr" target="#b41">[42]</ref>. On ideal track the friction surfaces might stick together and force the wheelset into a more unfavourable position. Track irregularities help to get relative motion in the friction surfaces, which usually leads to better - and more realistic - steering behaviour of the vehicle. As seen in <ref type="figure" target="#fig_0">figure 19</ref>, the energy dissipation as a measure for the amount of wear or RCF, is much lower when simulating running with track irregularities. In one of their numerous studies on three-piece bogies Boronenko et al <ref type="bibr" target="#b14">[15]</ref> investigate the reason for excessive flange wear in some of the Russian wagons. One conclusion is that the main reason for flange wear is the unstable behaviour of the bogies in curves (rutting mode) <ref type="bibr" target="#b15">[16]</ref>, when the bogie is flanging with a two-point contact situation instead of negotiating the curve using the wheel conicity. The flanging is the result of bogie warping, which increases the angle of attack compared to a radial position. In the article a number of different designs are discussed. 
Among others it is concluded that a bogie design with radial arms significantly reduces the angle of attack and the wear number in curves, see <ref type="figure" target="#fig_1">figure 20</ref>. Berghuvud [64] investigated the curving behaviour of different types of three-piece bogie with and without braking. He concluded that the influence of braking on the curving behaviour is complex. Braking can have a positive effect on the angle of attack of the wheelsets in a curve since it helps to overcome the static friction in the primary suspension. It can also increase the angle of attack if large longitudinal forces push the wheelset longitudinally towards the limit of the play and thus lock the wheelset in an unfavourable position.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3.1">Vehicle Resistance</head><p>Radially steering bogies do not only reduce flange wear in curves but also reduce the required traction energy. The inner leading wheel is less affected and the trailing wheelset has much smaller values. With radial steering, ( <ref type="figure" target="#fig_1">figure 22</ref>) the leading axle also has very small creepages. This results in lower wear and running resistance. As a result on track with tight curves more than 20% of the overall running resistance can be reduced with similar levels of energy saving <ref type="bibr" target="#b65">[66]</ref>.</p><p>Of course radial steering may affect running stability on straight track. Therefore bogie designs with cross anchors such as the TVP 2007 or the Leila bogie have an advantage over individual radial steering axles as in the swing hanger bogie.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.3.2">Influence of curving on wheel and rail damage phenomena</head><p>As mentioned in the introduction to this section the curving performance of a freight wagon is very important for the level of wheel and rail damage. This means in turn that the vehicle track interaction in curves determines to a large extent the maintenance cost for the whole system. In <ref type="bibr" target="#b65">[66]</ref> Fröhling discusses the influence of, among others, bogie design, bogie maintenance and the wheel/rail interface in heavy haul operation on different damage phenomena on wheels and rails. In a later publication Fergusson et al <ref type="bibr" target="#b67">[67]</ref> present an analysis of wheel wear as a function of the relationship between the lateral and longitudinal primary suspension stiffness and the coefficient of friction at the centre plate between the wagon body and the bolster to minimise the wheel wear rate of a self-steering three-piece bogie without compromising vehicle stability. Simulation results indicate that wheel wear is theoretically the lowest for low lateral and longitudinal primary suspension stiffness and no friction at the centre plate. Casanueva et al <ref type="bibr" target="#b68">[68]</ref> extend the wear prediction methodology for freight wagons to also include switches and crossings. It is concluded that wear on some parts of the wheel profile can only be explained with running through switches.</p><p>Tunna and Urban <ref type="bibr" target="#b69">[69]</ref> carried out a parametric study to quantify the effects of various freight vehicle parameters on the generation of RCF. Three different freight suspensions wer considered: an enhanced three-piece bogie, a rigid-frame bogie with primary suspension, and a two-axle vehicle with leaf springs. Simulations were performed for track curvature ranging from 400 to 10 000 m. 
To judge the generation of RCF the Tgamma model from Burstow <ref type="bibr" target="#b70">[70]</ref> was used. It is stated that parameters that clearly need to be considered when evaluating rail surface damage are curve distribution, track quality, conicity, vehicle type and loading state of the wagon. Since several parameters are line dependent it is concluded that a route based analysis is necessary.</p><p>In <ref type="bibr" target="#b71">[71]</ref> a simulation model of an iron ore wagon with three-piece bogie is developed to investigate the risk of RCF on the Swedish and Norwegian iron ore line. 43 load cases with various conditions were used as inputs. The risk for RCF was estimated with the so-called shakedown map. The wear number, which is the product of creepages and creep forces, was calculated to estimate where initiated cracks develop or are worn away. In <ref type="figure" target="#fig_1">figure 23</ref> areas on the wheel profile with high risk of RCF can be seen. The area on the wheel tread coincides very well with field observations of RCF but the areas in the flange root and on the flange did not show RCF damage. It can be concluded that the energy dissipation is high enough to wear away initiated cracks. It seems that simulation of the curving behaviour of freight wagons can provide valuable information about the risk of wheel damage for specific operating conditions.</p><p>In <ref type="bibr" target="#b71">[71]</ref> a simulation by Dukkipati and Dong examine the effects of a freight wagon running over a dipped joint. In a very recent paper Wang and Gao investigate the wheel wear of a freight vehicle with three-piece bogie in curves <ref type="bibr" target="#b99">[99]</ref>. It is shown that wear is most severe on the outer leading wheel in the bogie. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="3.4">Parameter identification</head><p>The establishment of the correct parameters for use in computer models is clearly of great importance. Some parameters can easily be measured or provided by the manufacturers but others are very difficult to establish. Ren et al <ref type="bibr" target="#b74">[74]</ref> demonstrate the use of a test rig with a sliding plate underneath one wheelset to establish key parameters. The sliding plate is moved with actuators and the forces are measured to allow the lateral, shear and warp stiffness to be established as well as the friction characteristics of the bogie.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4">Modern Developments</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.1">The British Rail HSF Bogies</head><p>Wickens and colleagues at British Rail Research carried out theoretical and practical work aimed at understanding the dynamic performance of two-axle freight vehicles <ref type="bibr" target="#b75">[75]</ref>, <ref type="bibr" target="#b76">[76]</ref>. The aim was to increase the operating speed of freight vehicles and reduce the rate of derailments. A series of experimental two-axle vehicles were constructed to confirm the results of the analysis. They included coil springs and viscous dampers and longitudinal rods to control yaw motion and were initially tested on a full size roller rig.</p><p>Computer simulations of curving and stability were carried out with various damper configurations and on-track tests of several prototypes were undertaken.</p><p>The result of this work was the prototype 'HSFV.4' high speed freight vehicle with viscous damping (<ref type="figure" target="#fig_1">figure 24</ref>) which was tested at speeds of up to 120 km/h and proved to run without hunting for a wide range of effective conicity values. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.2">The Unitruck running gear</head><p>The Unitruck single-axle running gear with lateral "swing hangers" was first developed for the American market and in the 1990's adjusted to suit European conditions. Vehicles with Unitruck running gear <ref type="bibr" target="#b76">[76]</ref> are today used both in North America and Europe. They have only one stage suspension, which also includes friction damping. As in the Y25 bogie, the vertical force in the primary suspension is used to preload the different friction components via an inclined surface. <ref type="figure" target="#fig_1">Figure 25</ref> left shows the wedge element, which is in series with one of the coil springs and in contact with the carbody via an inclined friction surface; the vertical surface in contact with the saddle is also a friction surface. Newer designs have substituted the inclined friction surface by a roller (figure 25 left) <ref type="bibr" target="#b77">[77]</ref>, thus enabling the displacement in the longitudinal direction, but reducing longitudinal damping. Also, adding a coupling plate in the centre of the coil springs increases longitudinal stiffness <ref type="figure" target="#fig_1">(Figure 25</ref> right), which improves critical speed compared to the running gear with rollers and classic coil springs. The 'Swing Motion' bogie ( <ref type="figure" target="#fig_1">figure 26</ref>) is a variant of the three-piece freight bogie and was originally developed for heavy haul operations in North America. In the Swing Motion design an additional cross member or transom is included which connects the two side frames together via pivots at the base of the secondary spring pack. The bolster still sits on the top of the spring packs and is damped through friction wedges. A pivot between the axle boxes and the side frames is also included so that the side frames can pivot or swing to accommodate lateral motion of the bolster. 
The swing motion gives increased lateral stability at speeds up to 176 km/h and is claimed to reduce wheel and rail wear, reduce rolling resistance and forces on track and vehicle body compared with standard three-piece bogies.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.4">The 'LTF' bogie</head><p>In the 1980s British Rail Research in the UK developed a novel, track-friendly bogie using passenger vehicle technology. The LTF25 bogie is shown in <ref type="figure" target="#fig_1">figure 27</ref> and is described in <ref type="bibr" target="#b79">[79]</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Figure 27: The 'LTF25' bogie</head><p>The LTF25 bogie was specifically designed to reduce dynamic track forces and as part of this effort was made to reduce the unsprung mass. Small wheels (813 mm diameter) were used and inside axle boxes giving a 30% reduction in wheelset mass although this necessitated the use of on-board hotbox detectors.</p><p>Primary suspension is through steel coil springs and secondary suspension is through rubber spring elements and hydraulic dampers.</p><p>The high cost of the LTF25 bogie and concerns about axle fatigue with inboard axle boxes militated against its adoption but Powell Duffryn produced a modified version of the bogies known as the TF25 bogie (shown in <ref type="figure" target="#fig_1">figure 28</ref>) which has achieved considerable production success. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.5">The 'Gigabox' bogie</head><p>The 'Gigabox' bogie uses pedestal units containing progressive rubber springs with integral hydraulic damping (as shown in figures 29 and 30). The system was developed by ContiTec and SKF and is claimed not to require maintenance for up to 1 million km and to provide good noise and vibration isolation. A reduction of up to 20% in lateral forces is claimed as well as a 2 dB reduction in noise. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.6">The Double Rubber Ring Spring (DRRS) bogie</head><p>Originally designed by Talbot, the DRRS bogie uses double rubber toroidal ring springs with load proportional friction damping as shown in <ref type="figure" target="#fig_0">figure 31</ref>. Container wagons with DRRS bogies entered service with the DB 'Inter Cargo Express-System'. Maximum axle-load ranges from 22.5 t at 100 km/h to 18.375 t at 160 km/h. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.7">Advances in three-piece bogies</head><p>The major drivers for advances of AAR three-piece bogies were tightening ride performance and track impact standards, such as M-1001 <ref type="bibr" target="#b79">[79]</ref> and M-976 <ref type="bibr" target="#b80">[80]</ref>, since 2000.</p><p>An overview of improvements in the suspensions is given in <ref type="bibr" target="#b82">[81]</ref>. Suspension springs tend to increase the deflection. Using higher control springs under the wedges increases friction under the empty wagon thus providing its better stability, and makes damping less dependent on the wear of wedges themselves. Different height of the inner and outer springs allows having lower lateral stiffness of the suspension under the empty wagon, thus improving its running performance. Using the set of 9 double springs per each side of the bogie increases warping resistance.</p><p>The innovative designs of the wedges are shown in <ref type="figure" target="#fig_1">figure 32</ref>. Both designs aim to increasing the warping resistance of the bogie. The split wedge consists of two symmetric parts inclined towards each other and interacts with the spatial insert in the bolster pocket. In the spatial wedge the surfaces are inclined in the other direction and they are wider than the vertical surface, which gives the same effect. In the interaction between the side frame and the wheelset axle various elastic components are introduced to reduce unsprung mass as well as to reduce resistance to wheelset displacement in plane, thus reducing the lateral track forces. Some of the designs of elastic shear pads are shown in <ref type="figure">figure 33</ref>. The rigid side bearings with clearances have transformed in modern three-piece bogies into constant contact side bearings, incorporating the elastic element compressed by the weight of the car body, <ref type="bibr" target="#b83">[82]</ref>. 
Examples of the design are shown in <ref type="figure" target="#fig_3">figure 34</ref>. Constant contact side bearings provide yaw damping for the bogies on straight track, as well as additional car body roll resistance for better curving performance. The rollers positioned with a clearance provide rigid bumpstop that limits the elastic element deflection without increasing the yaw resistance. There are several devices used to increase warping stiffness of three-piece bogies, the most common of which is using cross-braces between the side frames shown in <ref type="figure" target="#fig_4">figure 35</ref>.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Cap</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Elastic element</head><p>Cage Wear resistant element</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Insert</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Roller</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Cap</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head>Elastic element</head><p>Cage 1 - top brace; 2 - bottom brace; 3 - bolt; 4 - washer; 5 - nut; 6 - fastening unit; 7 - rings; 8 - locking plate; 9 - washer; 10 - bolt; 11 - elastic pad; 12 - safety wire; 13, 14 - bracket; 15, 16, 17 - plate; 18 - key <ref type="figure" target="#fig_4">Figure 35</ref> Cross-braces between side frames.</p><p>Using the concept of shear and bending stiffness of the bogie Scheffel <ref type="bibr" target="#b84">[83]</ref>, developed several novel designs of three-piece bogies (figure 36). At first the horizontal motion of the frame is decoupled from the wheelsets by horizontally soft primary suspension. Then the axle boxes are interconnected through sub-frames or arms by elastic elements that support their radial position in curves, but resist in-phase yaw <ref type="bibr" target="#b85">[84]</ref>. Scheffel bogies having the axle load of 32 t provide mileage between wheel turning of up to 1.5 million kilometres thus proving the high efficiency of the design to reduce track forces.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.8">The Lenoir pusher spring</head><p>Various alternatives to the double Lenoir linkage have been explored with the aim of providing reduced longitudinal stiffness at low cost. One example is the 'Lenoir pusher spring' which consists of a plunger and washer springs mounted opposite the Lenoir pusher (<ref type="figure" target="#fig_32">figure 37</ref>). This allows more longitudinal motion than the conventional arrangement. Piotrowski <ref type="bibr" target="#b87">[86]</ref> reports how this arrangement has been shown to give good performance in a prototype vehicle with significant reductions in wheel wear.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.9">The RC25NT Bogie</head><p>Eisenbahn Laufwerke Halle (Germany) has developed the RC25NT self-steering three-piece bogie with direct inter-axle linkages which was presented at the Innotrans exhibition in 2010 <ref type="bibr">[87]</ref> (<ref type="figure" target="#fig_2">figure 38</ref>). The bogie has horizontally soft rubber bushes in the primary suspension and flexicoil dual rate springs with friction damping in the secondary suspension. The bogie is equipped with disk brakes. The aim of the development was to build a bogie capable of stable running up to 120 km/h, keeping low noise criteria and negotiating curves with a minimum of wear. The bogie is designed to replace the Y25 type bogie without changes to the wagon body.</p><p>Simulations have shown that the RC25NT provides better stability on straight track than the Y25 (figure 39) and less wheel and rail wear in curves (<ref type="figure" target="#fig_3">figure 40</ref>). The bogie was tested according to the UIC 518 standard in Sweden in 2010 for speeds up to 160 km/h. The RC25NT demonstrates that direct inter-axle linkages can allow freight car bogies to run at 120 km/h with proper steering and low wear in curves. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.10">The 'LEILA' Bogie</head><p>The LEILA bogie ('LEIchtes und LärmArmes GüterwagenDrehGestell' with the meaning of light and low noise freight bogie) is a passive radial steering bogie with a maximum axle load of 22.5 t and was developed between 2000 and 2005 during a German and Swiss research project <ref type="bibr" target="#b88">[88]</ref>. The Institute of Rail Vehicles of the Technische Universität Berlin was one of the involved partner. The aim to develop this bogie was:</p><p> to reduce the noise emissions of freight wagons;  to reduce the mass of a bogie to be under 4 t and  to reduce significantly wear and running resistance.</p><p>In addition:</p><p> the reliability and availability of freight wagons;  transparency in the transport chain;  the active and passive safety of the freight traffic and;  the transport velocity should be similarly increased <ref type="bibr" target="#b89">[89]</ref>. The primary layer consists of rubber springs and the load dependent stiffness characteristics are separated in vertical and horizontal working components. The bogie has passive radial steering technology of the wheelsets. Wheelsets are able to rotate about the vertical axis without any external energy but only by the roll radius difference between the inner and outer wheel. Both wheelsets are connected with cross anchors; mounted on opposite axle boxes. The secondary layer is defined UIC centre of pivot and side bearer (latter guarantees the exchangeability to Y25 bogies). In addition, the centre of pivot has an elastically bearing using a secondary rubber spring. The LEILA bogie prototype was examined during various field tests where it demonstrated its advantages compared to a Y25 bogie. The noise emissions were reduced up to 18 dB(A) compared to a Y25 bogie with cast iron brake blocks and up to 8 dB(A) compared to a Y25 bogie with composite blocks (k- blocks). 
But the bogie failed at that time to enter the market. During the very good ongoing homologation process the producer of the bogie decided to stop the production of new freight wagons and bogies. Therefore the homologation was stopped and not finished just for commercial reasons. Right now as more and more EMUs are produced with inner bearings it is expected that the acceptability of inner bearing bogies with the advantages less weight and lower forces at the axles in curves will be more acceptable. As with the Leila bogie the cross anchor couples the two axles so that they turn with a phase shift of 180°. This stabilizes the radial steering effect even when the wheel-rail contact is not perfect and the second very important effect is dynamic stabilisation without yaw dampers for high speed straight track running. On curvy track significant flange and running surface wear reduction and also significant reduction of the running resistance occur.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="4.12">The SUSTRAIL Bogie</head><p>The aim of the SUSTRAIL project is to promote modal shift of freight in Europe from road to rail. The SUSTRAIL project intends to provide the approach, structure, and technical content to support this modal shift through improvements in the railway freight system including innovations in rolling stock in track components. The project includes workpackages focused on market research, vehicles, infrastructure and assessment of cost benefits. The work described here is part of workpackage 3: 'The freight vehicle of the future'.</p><p>The main scientific and technological innovations being considered for the SUSTRAIL freight vehicle are:</p><p> The development of advanced vehicle dynamics concepts based on new wheel profiles and improvements in suspension design responding to the needs of a mixed traffic railway;  Developments in the traction and braking systems for high speed low impact freight operation;  Novel designs and materials for lightweight high performance freight wagon body vehicles and bogie structures;  Advanced condition based predictive maintenance tools for critical components of both railway vehicles and the track;  Identification of performance based design principles to move towards the zero maintenance ideal for the vehicle/track system.</p><p>Partners in the project have carried out a technology review to identify the potential innovative technologies to meet the above requirements and the results have been ranked and two concept vehicles are being designed. The 'Conventional' vehicle will use optimised existing technology and a demonstrator for this is being built as part of the project. 
The 'Futuristic' vehicle will utilise technology which has not yet been proven in the railway field but has potential to make greater improvements.</p><p>Simulations have been carried out of the dynamic behaviour of the concept design vehicles running on typical track in tare, part laden and fully laden cases. In line with the target of a 50% reduction in lateral forces on the track and stable running at 140 km/h a suspension using double Lenoir linkages, longitudinal linkages between axle boxes and centre pivot suspension has been selected. Computer simulation has been used to optimise the suspension and to select suitable parameters for the various components. Assessment of the results is based on:</p><p> Stability: stable running on typical European track at the design speed of 140km/h must be ensured and ride quality (vertical lateral and longitudinal accelerations experienced by the goods transported) will be assessed.  Reduced track forces: track geometrical deterioration (ballast settlement and horizontal level, alignment and buckling), rail surface damage (wear, rolling contact fatigue -RCF) and track components damage (sleeper cracking, rail pad deterioration, rail fatigue, fastening deterioration) will all be assessed.</p><p>A benchmark vehicle has been selected based on a Y25 bogie and flat bed wagon and has been used to allow quantification of the benefits of the new design.</p><p>A number of radical innovations were considered during the technology review stage of the project but it was decided that the use of double Lenoir link primary suspension as in the Y37 series of bogies (figure 44), would be investigated. The double Lenoir link suspension provides much lower longitudinal primary stiffness while still utilising standard components and methods which are well established within the railway industry. 
this work A model of the SUSTRAIL vehicle was set up with double Lenoir links using the computer simulation tool Gensys and the influence of variations in the suspension parameters on the critical speed of the wagon was simulated. Straight track was used for this simulation and an initial lateral disturbance was introduced followed by ideal track with no irregularities. Axle load is 22.5 t, wheel profile is S1002 and rail profile UIC60 inclined at 1:40. The wheel rail coefficient of friction is set at 0.35. The wagon speed was reduced from an initial 170 km/h and critical speed assumed to have been reached when the track shifting force (∑ ) drops below 2.5 kN. An example is shown in <ref type="figure" target="#fig_3">figure 46</ref>. Further variations were carried out and the effect of the friction coefficient and stiffness within the suspension on the maximum contact force is shown in <ref type="figure" target="#fig_3">figure 49</ref>. <ref type="figure" target="#fig_3">Figure 49</ref>: The effect of friction coefficient and spring stiffness on the contact force It can be seen that the maximum vertical contact forces tends to increase with the damping and with the spring stiffness. In order to improve the running behavior of the SUSTRAIL vehicle it was decided to assess the benefit of linkages provividing longitudinal stiffness between the axleboxes using a radial arm. A radial arm designed by <ref type="bibr">Scheffel [90]</ref> was studied previously in the Infra-Radial project <ref type="bibr" target="#b91">[91]</ref> which aimed to develop a bogie for heavy haul vehicles (axle loads over 25T) with reduced life cycle costs. Tests using the radial arm with four different primary suspension types showed good results with stable running and radially aligned wheelsets in curves. Wear of the wheels was seen to reduce significantly <ref type="bibr" target="#b91">[91]</ref>. 
In the work reported here simulation was carried out using MEDYNA for the SUSTRAIL vehicle with double Lenoir links and modified radial arms. Simulations have confirmed that the radial arm should provide lateral stiffness between the wheelsets and optimised parameters have been defined. A prototype of the SUSTRAIL freight vehicle is being constructed by REMARUL engineering. In addition to Vertical coils spring stiffness <ref type="bibr">[%]</ref> the innovative suspension described in this paper the SUSTRAIL vehicle will have disk brakes with an electronic control system. The bogie design is shown in <ref type="figure" target="#fig_4">figure 50</ref>. </p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="5">Longitudinal dynamics</head><p>The longitudinal dynamic behaviour of railway vehicles is often neglected as the link to the vehicle track interaction is generally not significant and it has been common to assume that all vehicles of the same type in a train will behave identically. In heavy haul freight applications however where long trains are common the effect of longitudinal dynamics can become significant. In <ref type="bibr" target="#b71">[71]</ref> for example Qi et al. model the longitudinal behaviour of a long train including traction and braking and the coupling between vehicles. Belforte et al. <ref type="bibr" target="#b93">[93]</ref> also analyse the effects of severe traction and braking forces on longitudinal dynamics.</p><p>There are several areas where longitudinal dynamics can interact with the general vehicle dynamics. These include:</p><p>• Wheel unloading on curves due to lateral components of coupler forces; • Wagon body pitch due to coupler impact forces and • Bogie pitch due to coupler impact forces. Cole <ref type="bibr" target="#b94">[94]</ref> describes how these effects can be assessed in different cases and McClanachan <ref type="bibr" target="#b95">[95]</ref> and El Sibaie <ref type="bibr" target="#b96">[96]</ref> present results of computer simulations including coupler models.</p><p>Freight vehicles have to provide satisfactory performance at low cost in tare and laden condition on varying track quality. This has resulted in several standard designs including the Y25 and the three-piece bogie. These designs use friction damping proportional to the vehicle mass to provide good dynamic performance at all loading conditions. In recent years vehicle designers have tried to improve on the dynamic performance of freight wagons and the use of computer tools has helped to overcome the compromise between good curving performance and stability at higher speeds. 
This has resulted in a number of innovative designs with demonstrable performance improvements but it is notable that few of these have yet made a significant impact in the worldwide freight train fleets.</p><p>A key reason for this lack of adoption is probably the innately conservative nature of the railway industry. Of course this often has a sound basis in, for example, the benefit of using standard components which allow effective maintenance of widely dispersed fleets of vehicles but in order to allow the benefits of the innovative techniques and designs summarised in this paper it is time to reconsider the design of freight vehicles. This could allow increases in speed with lower impact on track and environment and a resulting step change in performance of the railway system. One encouraging sign is the establishment in some countries of track access charging which benefits the use of vehicles with 'track friendly' suspension. Together with emerging legislation and growing pressures on system capacity it is likely that the demand for freight vehicles with higher dynamic performance will climb rapidly.</p><p>Rail freight can only contribute to mitigating the environmental impacts of transportation if the knowledge and today's experience for innovative products is used. Some basic thoughts can be found here and in <ref type="bibr" target="#b97">[97]</ref>. Optimising performance through the development of innovative products is to be planned and procured carefully. This paper has demonstrated that freight vehicle designers have innovative designs of running gear and computer simulation tools ready for this challenge.</p></div><figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_0"><head>Figure 1 :</head><label>1</label><figDesc>Figure 1: Freight wagon from Kockums Sweden, built in 1882 [4].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_1"><head>Figure 2 :</head><label>2</label><figDesc>Figure 2: UIC double link suspension.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_2"><head>Figure 3 : Double link suspension [ 8 ]. Parts of double link (a), assembled double link (b) and mounted double link (c).</head><label>38</label><figDesc>Figure 3: Double link suspension [8]. Parts of double link (a), assembled double link (b) and mounted double link (c).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_3"><head>Figure 4 :</head><label>4</label><figDesc>Figure 4: DB bogie Type 931 [7].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_4"><head>Figure 5 :</head><label>5</label><figDesc>Figure 5: DB bogie Type 665 [7].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_5"><head>Figure 7 : Model 18- 100 bogie: a - general view, b - central suspension scheme, c - primary 'suspension' scheme ( 1 - wheelset; 2 - side frame; 3 - bolster; 4 - braking leverage; 5 - central pivot; 6 - rigid side bearings; 7 - suspension springs; 8 - friction wedge; 9 -</head><label>7100123456789</label><figDesc>Figure 7: Model 18-100 bogie: a-general view, b-central suspension scheme, c-primary 'suspension' scheme (1-wheelset; 2-side frame; 3-bolster; 4-braking leverage; 5-central pivot; 6-rigid side bearings; 7-suspension springs; 8-friction wedge; 9-axle-box)</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_7"><head>Figure 8 : Force-displacement curve of Coulomb friction model (left) and Coulomb model with spring in series as in [ 29 ]</head><label>829</label><figDesc>Figure 8: Force-displacement curve of Coulomb friction model (left) and Coulomb model with spring in series as in [29] (right).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_8"><head>Figure 9 : Friction element with spring in series.</head><label>9</label><figDesc>Figure 9: Friction element with spring in series.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_10"><head>Figure 10 : Typical force-displacement diagram of leaf spring/link suspension. Example of curve for small displacements around static equilibrium.</head><label>10</label><figDesc>Figure 10: Typical force-displacement diagram of leaf spring/link suspension. Example of curve for small displacements around static equilibrium.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_11"><head>Figure 11 : Model for leaf spring or link suspension as used for example by KTH [ 40 ]. See figure 10 for definition of k1 and k2.</head><label>114010</label><figDesc>Figure 11: Model for leaf spring or link suspension as used for example by KTH [40]. See figure 10 for definition of k1 and k2.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_12"><head>Figure 12 Model for bumpstop element (∆ - clearance, - stiffness of the bumpstop)</head><label>12</label><figDesc>Figure 12 Model for bumpstop element (∆-clearance,-stiffness of the bumpstop)</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_13"><head>Figure 13 Model for center plate element (∆ - distance between center plate edge and car body center of gravity, - roll angle, - weight of the car body per one center plate, - roll torque, - equivalent roll stiffness)</head><label>13</label><figDesc>Figure 13 Model for center plate element (∆-distance between center plate edge and car body center of gravity,-roll angle,-weight of the car body per one center plate,-roll torque,-equivalent roll stiffness)</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_14"><head>Figure 15 : Procedure to find the non-linear critical speed [ 60 ].</head><label>1560</label><figDesc>Figure 15: Procedure to find the non-linear critical speed [60].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_15"><head>Figure 16 : Bifurcation diagram for a loaded two-axle vehicle with link suspension ( 21 t axle load) Wheel: somewhat worn S1002. Rail: Nominal UIC60 [ 42 ].</head><label>162142</label><figDesc>Figure 16: Bifurcation diagram for a loaded two-axle vehicle with link suspension (21 t axle load) Wheel: somewhat worn S1002. Rail: Nominal UIC60 [42].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_16"><head>Figure 17 .</head><label>17</label><figDesc>shows attractors for two different types of freight wagons. The results are in principle quite similar to those in figure 16.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_17"><head>Figure 17 : Attractors for the Hbbills 311 and the G69 freight wagons. The model with the measured characteristics of the UIC links is damping less than the model with the cylindrical characteristics. The hunting attractor exists even for low speeds [61].</head><label>17</label><figDesc>Figure 17: Attractors for the Hbbills 311 and the G69 freight wagons. The model with the measured characteristics of the UIC links is damping less than the model with the cylindrical characteristics. The hunting attractor exists even for low speeds [61].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_18"><head>Figure 18 Map of lateral oscillation amplitude in single wagon as function of curve radius [62].</head><label>18</label><figDesc>Figure 18 Map of lateral oscillation amplitude in single wagon as function of curve radius [62].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_19"><head>Figure 19 : Energy dissipation. Comparative simulation with and without track irregularities. Two-axle vehicle with link suspension. 22 . 5 t axle load [ 42 ].</head><label>1922542</label><figDesc>Figure 19: Energy dissipation. Comparative simulation with and without track irregularities. Two-axle vehicle with link suspension. 22.5 t axle load [42].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_20"><head>Figure 20 : Angle of attack (a) and wear number (b) for wagons in a curve of 200 m radius at 60 km/h with 18- 100 bogies respectively bogies with radial arm upgrade [ 15 ].</head><label>2010015</label><figDesc>Figure 20: Angle of attack (a) and wear number (b) for wagons in a curve of 200 m radius at 60 km/h with 18-100 bogies respectively bogies with radial arm upgrade [15].</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_21"><head>Figure 21 : Y25 bogie running in a 300m curve Wheel slip lateral and longitudinal at all wheel rail contact points, 90 t tank car with a Y25-Bogie in a 300 m curve, speed 80 km/h, lateral acceleration aq= 0, 67 m/s², s1002 Wheel profile, UIC 60E1, 1 Figure 22 : Radially steered bogie running in a 300 m curve Wheel slip lateral and longitudinal at all wheel- rail contact points, 90 t tank car with a Leila-Bogie in a 300 m curve, speed 80 km/h, lateral acceleration aq= 0, 67 m/s², s1002 Wheel profile, UIC 60E1, 1</head><label>21903006712230090300671</label><figDesc>Figure 21: Y25 bogie running in a 300m curve Wheel slip lateral and longitudinal at all wheel rail contact points, 90 t tank car with a Y25-Bogie in a 300 m curve, speed 80 km/h, lateral acceleration aq= 0,67 m/s², s1002 Wheel profile, UIC 60E1, 1:40 rail inclination</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_22"><head>Figure 23 : Calculated RCF positions of the wheel with corresponding average wear number. The far-left line is also reported as the observed approximate location for RCF initiation.</head><label>23</label><figDesc>Figure 23: Calculated RCF positions of the wheel with corresponding average wear number. The far-left line is also reported as the observed approximate location for RCF initiation.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_23"><head>Figure 24 :</head><label>24</label><figDesc>Figure 24: The HSFV.1 experimental freight wagon</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_24"><head>Figure 25 :</head><label>25</label><figDesc>Figure 25: Unitruck running gear (left) and modifications for improving curving behaviour (right).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_25"><head>Figure 28 :</head><label>28</label><figDesc>Figure 28: The TF25 bogie</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_26"><head>Figure 29 : The Gigabox bogie Figure 30 :</head><label>2930</label><figDesc>Figure 29: The Gigabox bogie</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_27"><head>Figure 31 : The DRRS bogie and cross section</head><label>31</label><figDesc>Figure 31: The DRRS bogie and cross section</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_28"><head>Figure 32 :</head><label>32</label><figDesc>Figure 32: Split wedge (left) and spatial wedge (right).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_30"><head>Figure 33 : Adapter Plus ® by Amsted (left) and layered shear pad in Russian 18- 9800 bogie (right).</head><label>339800</label><figDesc>Figure 33: Adapter Plus ® by Amsted (left) and layered shear pad in Russian 18-9800 bogie (right).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_31"><head>Figure 34 : Constant contact side bearing with springs (left) and with non-metal element and roller (right).</head><label>34</label><figDesc>Figure 34: Constant contact side bearing with springs (left) and with non-metal element and roller (right).</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_32"><head>Figure 37 :</head><label>37</label><figDesc>Figure 37: The Lenoir pusher spring</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_33"><head>Figure 38 : RC25NT bogie with direct inter-axle linkages Figure 39 : Simulation stability results for RC25NT bogie vs. Y25 bogie (upper figure = high conicity, lower figure = low conicity)Figure 40 : Simulated wear number for RC25NT bogie vs. Y25 bogie</head><label>383940</label><figDesc>Figure 38: RC25NT bogie with direct inter-axle linkages</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_34"><head>Figure 41</head><label>41</label><figDesc>Figure 41 and 42 show the main components of this bogie. Compared to the standard bogies such as Y25, the LEILA bogie has inner bearings. The resulting better force flow leads to a weight reduction of the bogie frame and wheelset resulting in an overall weight reduction of 750 kg per bogie compared to Y25 bogie. At the web of the wheels (diameter: 920 mm), disc brakes are mounted. The primary layer consists of rubber springs and the load dependent stiffness characteristics are separated in vertical and horizontal working components. The bogie has passive radial steering technology of the wheelsets. Wheelsets are able to rotate about the vertical axis without any external energy but only by the roll radius difference between the inner and outer wheel. Both wheelsets are connected with cross anchors; mounted on opposite axle boxes. The secondary layer is defined by the UIC centre of pivot and side bearer (latter guarantees the exchangeability to Y25 bogies). In addition, the centre of pivot has an elastic bearing using a secondary rubber spring. The LEILA bogie prototype was examined during various field tests where it demonstrated its advantages compared to a Y25 bogie. The noise emissions were reduced up to 18 dB(A) compared to a Y25 bogie with cast iron brake blocks and up to 8 dB(A) compared to a Y25 bogie with composite blocks (K-blocks). But the bogie failed at that time to enter the market. During the very good ongoing homologation process the producer of the bogie decided to stop the production of new freight wagons and bogies. Therefore the homologation was stopped and not finished just for commercial reasons. Right now as more and more EMUs are produced with inner bearings it is expected that inner bearing bogies, with the advantages of less weight and lower forces at the axles in curves, will be more acceptable.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_35"><head>Figure 41 : Main components of LEILA bogie [ 88 ] Figure 42 : Leila Bogie from beneath with the inner bearings, cross anchor and wheel disc brakes clearly visible 4 .</head><label>4188424</label><figDesc>Figure 41: Main components of LEILA bogie [88]</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_36"><head>Figure 43 : TVP2007 bogie by Tatravagónka a.s.</head><label>43</label><figDesc>Figure 43: TVP2007 bogie by Tatravagónka a.s.</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_37"><head>Figure 44 :</head><label>44</label><figDesc>Figure 44: A suspension with double Lenoir links</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_38"><head>Figure 46 : A sample simulation results showing the establishment of the critical speed for the SUSTRAIL vehicle with double Lenoir links</head><label>46</label><figDesc>Figure 46: A sample simulation results showing the establishment of the critical speed for the SUSTRAIL vehicle with double Lenoir links</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_39"><head>Figure 47 :</head><label>47</label><figDesc>Figure 47: The effect of Lenoir link angle, length and friction coefficient on the critical speed of the SUSTRAIL vehicle</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_40"><head>Figure 48 : Maximum vertical force on the rail for the SUSTRAIL vehicle running at 120 km/h</head><label>48</label><figDesc>Figure 48: Maximum vertical force on the rail for the SUSTRAIL vehicle running at 120 km/h</figDesc></figure>
+<figure xmlns="http://www.tei-c.org/ns/1.0" xml:id="fig_41"><head>Figure 50 :</head><label>50</label><figDesc>Figure 50: The prototype SUSTRAIL freight bogie</figDesc></figure>
+
+ <note place="foot" n="1">-side frame; 2-bolster; 3-wheelset; 4-primary suspension; 5-elastic connection between sub-frames Figure 36: Scheffel HS bogie (left) and bogie retrofitted with Radial Arm design (right).</note>
+ </body>
+ <back>
+ <div type="references">
+
+ <listBibl>
+
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">European freight vehicle running gear: today&apos;s position and future demands</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="middle">M</forename><surname>Hecht</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">PartF, Journal of Rail and Rapid Transit</title>
+ <imprint>
+ <biblScope unit="volume">215</biblScope>
+ <biblScope unit="page" from="1" to="11" />
+ <date type="published" when="2001" />
+ </imprint>
+ </monogr>
+ <note>Proc. Of the Inst. Of Mech. Engrs.</note>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">White Paper-Roadmap to a Single European Transport Area-Towards a competitive and resource efficient transport system</title>
+ </analytic>
+ <monogr>
+ <title level="j">European Commission</title>
+ <imprint>
+ <biblScope unit="volume">144</biblScope>
+ <date type="published" when="2011-03-28" />
+ </imprint>
+ </monogr>
+ <note>COM</note>
+</biblStruct>
+
+<biblStruct xml:id="b2">
+ <monogr>
+ <title level="m" type="main">Freight Wagon Running Gear-a review, KTH Railway Division</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2002" />
+ <pubPlace>Stockholm</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b3">
+ <monogr>
+ <title level="m">Swedish: Järnvägsfordon från Kockums), Kockums industrier</title>
+ <meeting><address><addrLine>Malmö, Sweden, Pamphlet</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="1995" />
+ </imprint>
+ </monogr>
+ <note>Railway vehicles from Kockums</note>
+</biblStruct>
+
+<biblStruct xml:id="b4">
+ <monogr>
+ <title level="m">UIC Code 517. Wagons-Suspension gear (Standardisation)</title>
+ <imprint/>
+ </monogr>
+ <note>6th edition 1-7-79. Reprint 1-1-89. incorporating 8 amendments</note>
+</biblStruct>
+
+<biblStruct xml:id="b5">
+ <monogr>
+ <title level="m" type="main">Laufwerkskonstruktion und Erhöhung der Radsatzlasten im Güterverkehr. ZEV-Glasers Annalen 107</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">T</forename><surname>Madeyski</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1983" />
+ <biblScope unit="page" from="139" to="147" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b6">
+ <monogr>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">L</forename><surname>Müller</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">W</forename><surname>Niedermeyer</surname></persName>
+ </author>
+ <title level="m">Weiterentwickelte Güterwagendrehgestelle der Deutschen Bundesbahn für 22.5 t Radsatzlast-wieder nach dem Lenkachsenprinzip. ZEV-Glasers Annalen</title>
+ <imprint>
+ <date type="published" when="1987" />
+ <biblScope unit="volume">111</biblScope>
+ <biblScope unit="page" from="188" to="196" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b7">
+ <analytic>
+ <title level="a" type="main">Dynamic analysis of a freight car with standard UIC single-axle running gear</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>Lange</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">KTH Railway Technology</title>
+ <imprint>
+ <biblScope unit="page">34</biblScope>
+ <date type="published" when="1996" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b8">
+ <monogr>
+ <title level="m" type="main">Running behavior of railway freight wagon with single axle running gear</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <idno>1998:40</idno>
+ <imprint>
+ <date type="published" when="1998" />
+ <publisher>KTH</publisher>
+ </imprint>
+ <respStmt>
+ <orgName>Division of Railway Technology</orgName>
+ </respStmt>
+ </monogr>
+<note type="report_type">TRITA-FKT Report</note>
+</biblStruct>
+
+<biblStruct xml:id="b9">
+ <monogr>
+ <title level="m" type="main">How to improve the running behavior of freight wagons with UIC-link suspension. Vehicle System Dynamics Supplement 33</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1999" />
+ <biblScope unit="page" from="394" to="405" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b10">
+ <monogr>
+ <title level="m" type="main">Running behavior of freight wagons with link bogies. TRITA-FKT Report 1999:12, Division of Railway Technology</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1999" />
+ <publisher>KTH</publisher>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b11">
+ <monogr>
+ <title level="m" type="main">Neue Erkenntnisse über das Verschleissverhalten von Güterwagendrehgestellen, ZEV Glasers Annalen 111</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">W</forename><surname>Specht</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1987" />
+ <biblScope unit="page" from="271" to="280" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b12">
+ <monogr>
+ <title level="m" type="main">Association of American Railroads. Manual of standards and recommended practices. Section D. Trucks and truck details</title>
+ <imprint>
+ <date type="published" when="2010" />
+ <biblScope unit="volume">130</biblScope>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b13">
+ <analytic>
+ <title level="a" type="main">Bogies two-axle three-piece for freight wagons of 1520 mm gauge railways. General technical specifications</title>
+ </analytic>
+ <monogr>
+ <title level="m">GOST 9246-2013</title>
+ <imprint>
+ <biblScope unit="volume">38</biblScope>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b14">
+ <analytic>
+ <title level="a" type="main">Refining the wedge friction damper of three-piece freight bogies</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Orlova</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Romen</forename><surname>Yu</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">46</biblScope>
+ <biblScope unit="page" from="445" to="455" />
+ <date type="published" when="2008" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b15">
+ <analytic>
+ <title level="a" type="main">Influence of construction schemes and parameters of three-piece freight bogies on wagon stability, ride and curving qualities</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Y</forename><surname>Boronenko</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Orlova</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Rudakova</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">44</biblScope>
+ <biblScope unit="page" from="402" to="414" />
+ <date type="published" when="2006" />
+ </imprint>
+ </monogr>
+ <note>Supplement</note>
+</biblStruct>
+
+<biblStruct xml:id="b16">
+ <analytic>
+ <title level="a" type="main">Identification of parameters for spatial wedge system implemented in freight bogie design</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Orlova</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of the 10th mini-conference on Vehicle System Dynamics, Identification and Anomalies. Ed. I. Zobory. ISBN 978 963 420 968</title>
+ <meeting>the 10th mini-conference on Vehicle System Dynamics, Identification and Anomalies. Ed. I. Zobory. ISBN 978 963 420 968<address><addrLine>Budapest</addrLine></address></meeting>
+ <imprint>
+ <publisher>Komaromi Nyomda es Kiado Kft</publisher>
+ <date type="published" when="2008" />
+ <biblScope unit="volume">3</biblScope>
+ <biblScope unit="page" from="245" to="252" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b17">
+ <analytic>
+ <title level="a" type="main">A review of modeling methods for railway vehicle suspension components. Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">B</forename><forename type="middle">M</forename><surname>Eickhoff</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">R</forename><surname>Evans</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">J</forename><surname>Minnis</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">24</biblScope>
+ <biblScope unit="page" from="469" to="496" />
+ <date type="published" when="1995" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b18">
+ <analytic>
+ <title level="a" type="main">Modelling of suspension components in a rail vehicle dynamics context</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><surname>Bruni S</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Vinolas</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Berg</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">O</forename><surname>Polach</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Stichel</forename><forename type="middle">S</forename></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">49</biblScope>
+ <biblScope unit="issue">7</biblScope>
+ <biblScope unit="page" from="1021" to="1072" />
+ <date type="published" when="2011" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b19">
+ <analytic>
+ <title level="a" type="main">Modeling and Simulation of Freight Wagon with Special attention to the Prediction of Track Damage</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Casanueva</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Hossein</forename><surname>Nia</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Railway Technology</title>
+ <imprint>
+ <biblScope unit="volume">3</biblScope>
+ <date type="published" when="2014" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b20">
+ <analytic>
+ <title level="a" type="main">Modelling friction wedges, Part I: The state-of-the-art</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><forename type="middle">E</forename><surname>Klauser</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of IMECE04 2004 ASME International Mechanical Engineering Congress &amp; Exposition</title>
+ <meeting>IMECE04 2004 ASME International Mechanical Engineering Congress &amp; Exposition<address><addrLine>Anaheim (CA</addrLine></address></meeting>
+ <imprint>
+ <publisher>American Society of Mechanical Engineering</publisher>
+ <date type="published" when="2004" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b21">
+ <analytic>
+ <title level="a" type="main">A review of dynamics modelling of friction wedge suspensions. Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Q</forename><surname>Wu</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Cole</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Spiryagin</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Q</forename><surname>Sun</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">52</biblScope>
+ <biblScope unit="issue">11</biblScope>
+ <biblScope unit="page" from="1389" to="1415" />
+ <date type="published" when="2014" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b22">
+ <monogr>
+ <title level="m" type="main">Non-smooth Problems in Vehicle Systems Dynamics</title>
+ <imprint>
+ <date type="published" when="2010" />
+ <publisher>Springer</publisher>
+ <pubPlace>Berlin Heidelberg</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b23">
+ <analytic>
+ <title level="a" type="main">Rail Vehicle Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Anderson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Berg</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">KTH Royal Institute of Technology</title>
+ <imprint>
+ <date type="published" when="2013" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b24">
+ <monogr>
+ <title level="m" type="main">Simulation. In: Iwnicki, editor, Handbook of Railway Vehicle Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">O</forename><surname>Polach</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Berg</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Iwnicki</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2006" />
+ <publisher>Taylor &amp; Francis</publisher>
+ <biblScope unit="page" from="359" to="421" />
+ <pubPlace>London</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b25">
+ <analytic>
+ <title level="a" type="main">Modelling of wedge dampers in the presence of two-dimensional dry friction</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">F</forename><surname>Xia</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Vehicle system dynamics</title>
+ <meeting><address><addrLine>Lingby, Denmark</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="2003" />
+ <biblScope unit="volume">37</biblScope>
+ <biblScope unit="page" from="565" to="578" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b26">
+ <analytic>
+ <title level="a" type="main">Modeling and Dynamics of Friction Wedge Dampers in Railroad Freight Trucks</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">B</forename><surname>Kaiser</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">P</forename><surname>Cusumano</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">F</forename><surname>Gardner</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">38</biblScope>
+ <biblScope unit="issue">1</biblScope>
+ <biblScope unit="page" from="55" to="82" />
+ <date type="published" when="2002" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b27">
+ <analytic>
+ <title level="a" type="main">Multibody simulation of a freight bogie with friction dampers</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">N</forename><surname>Bosso</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Gugliotta</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Soma</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Railroad Conference</title>
+ <imprint>
+ <publisher>ASME/IEEE Joint</publisher>
+ <date type="published" when="2002" />
+ <biblScope unit="page" from="47" to="56" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b28">
+ <analytic>
+ <title level="a" type="main">Model of the UIC link suspension for freight wagons</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Piotrowski</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Archive of Applied Mechanics</title>
+ <imprint>
+ <biblScope unit="volume">73</biblScope>
+ <biblScope unit="issue">7</biblScope>
+ <biblScope unit="page" from="517" to="532" />
+ <date type="published" when="2003-12" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b29">
+ <monogr>
+ <title level="m" type="main">Smoothing dry friction damping by dither generated in rolling contact of wheel and rail and its influence on ride dynamics of freight wagons, NVSD</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Piotrowski</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2010-06" />
+ <biblScope unit="volume">48</biblScope>
+ <biblScope unit="page" from="675" to="703" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b30">
+ <analytic>
+ <title level="a" type="main">Equivalent viscous damping models of coulomb friction in multi-degree-of-freedom vibration systems</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">X</forename><surname>Tan</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><forename type="middle">J</forename><surname>Rogers</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Journal of Sound and Vibration</title>
+ <imprint>
+ <biblScope unit="volume">185</biblScope>
+ <biblScope unit="issue">1</biblScope>
+ <biblScope unit="page" from="33" to="50" />
+ <date type="published" when="1995-08" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b31">
+ <analytic>
+ <title level="a" type="main">Modelling of a two-dimensional Coulomb friction oscillator</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">F</forename><surname>Xia</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Journal of Sound and Vibration</title>
+ <imprint>
+ <biblScope unit="volume">265</biblScope>
+ <biblScope unit="issue">5</biblScope>
+ <biblScope unit="page" from="1063" to="1074" />
+ <date type="published" when="2003-08" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b32">
+ <analytic>
+ <title level="a" type="main">A substitute model of two-dimensional dry friction exposed to dither generated by rolling contact of wheel and rail</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Piotrowski</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">50</biblScope>
+ <biblScope unit="issue">10</biblScope>
+ <biblScope unit="page" from="1495" to="1514" />
+ <date type="published" when="2012" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b33">
+ <monogr>
+ <title level="m" type="main">The dynamics of a railway freight wagon wheelset with dry friction damping Vehicle System Dynamics 44 supplement</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>True</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><surname>Asmund</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2006" />
+ <biblScope unit="page" from="853" to="861" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b34">
+ <analytic>
+ <title level="a" type="main">Flexibility of trapezoidal springs</title>
+ </analytic>
+ <monogr>
+ <title level="j">ORE</title>
+ <imprint>
+ <date type="published" when="1986" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b35">
+ <monogr>
+ <title level="m" type="main">ORE, Parabolic springs for wagons (design, calculation, treatment)</title>
+ <imprint>
+ <date type="published" when="1988" />
+ <pubPlace>Utrecht, 43</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b36">
+ <analytic>
+ <title level="a" type="main">Improvement of the running stability of existing RIV wagons required to run under any loading conditions at speeds of 80 km/h</title>
+ </analytic>
+ <monogr>
+ <title level="j">ORE</title>
+ <imprint>
+ <date type="published" when="1967" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b37">
+ <monogr>
+ <title level="m" type="main">ORE: Etude de la stabilité transversale d&apos;un véhicule ferroviaire à deux essieux</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><forename type="middle">R</forename><surname>Joly</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1974" />
+ <pubPlace>Utrecht</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b38">
+ <monogr>
+ <title level="m" type="main">Computer simulation of freight vehicles with leaf springs</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">B</forename><surname>Ayasse</surname></persName>
+ </author>
+ <idno>INRETS/RE- 01-046-FR</idno>
+ <imprint>
+ <date type="published" when="2001" />
+ </imprint>
+ </monogr>
+<note type="report_type">Technical report</note>
+ <note>a comparison between different packages, INRETS</note>
+</biblStruct>
+
+<biblStruct xml:id="b39">
+ <analytic>
+ <title level="a" type="main">Modelling and laboratory investigations on freight wagon link suspensions with respect to vehicle-track dynamic interaction</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">KTH</title>
+ <imprint>
+ <date type="published" when="2004" />
+ </imprint>
+ </monogr>
+<note type="report_type">Licenciate Thesis</note>
+</biblStruct>
+
+<biblStruct xml:id="b40">
+ <analytic>
+ <title level="a" type="main">Experimental and theoretical analysis of freight wagon link suspension</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Andersson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Proceedings of the Institution of Mechanical Engineers</title>
+ <imprint>
+ <biblScope unit="volume">220</biblScope>
+ <biblScope unit="issue">4</biblScope>
+ <biblScope unit="page" from="361" to="372" />
+ <date type="published" when="2006-01" />
+ </imprint>
+ </monogr>
+ <note>Part F: Journal of Rail and Rapid Transit</note>
+</biblStruct>
+
+<biblStruct xml:id="b41">
+ <analytic>
+ <title level="a" type="main">Influence of link suspension characteristics variation on two-axle freight wagon dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Andersson</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">NVSD</title>
+ <imprint>
+ <biblScope unit="volume">44</biblScope>
+ <biblScope unit="issue">1</biblScope>
+ <biblScope unit="page" from="415" to="423" />
+ <date type="published" when="2006" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b42">
+ <monogr>
+ <title level="m" type="main">New simulation model for freight wagons with UIC link suspension, VSD</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Persson</forename><forename type="middle">I</forename></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2008" />
+ <biblScope unit="volume">46</biblScope>
+ <biblScope unit="page" from="695" to="704" />
+ </imprint>
+ </monogr>
+ <note>Suppl. 1</note>
+</biblStruct>
+
+<biblStruct xml:id="b43">
+ <monogr>
+ <title level="m" type="main">Dynamics of European two-axle freight wagons</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Hoffmann</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2006" />
+ <pubPlace>Kongens Lyngby, Denmark</pubPlace>
+ </imprint>
+ <respStmt>
+ <orgName>Technical University of Denmark</orgName>
+ </respStmt>
+ </monogr>
+<note type="report_type">Ph.D. Thesis</note>
+</biblStruct>
+
+<biblStruct xml:id="b44">
+ <monogr>
+ <title level="m" type="main">Freight wagon running gears with leaf spring and ring suspension, presented at the SIMPACK user group meeting</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Stiepel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Zeipel</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2004" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b45">
+ <analytic>
+ <title level="a" type="main">Simulation of the Response of Leaf Springs to Broad Band Random Excitation</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">D</forename><surname>Cebon</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">15</biblScope>
+ <biblScope unit="issue">6</biblScope>
+ <biblScope unit="page" from="375" to="390" />
+ <date type="published" when="1986" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b46">
+ <monogr>
+ <title level="m" type="main">Measurement and Representation of the Mechanical Properties of Truck Leaf Springs</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><forename type="middle">S</forename><surname>Fancher</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><forename type="middle">D</forename><surname>Ervin</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><forename type="middle">C</forename><surname>Macadam</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Winkler</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1980-08" />
+ <publisher>SAE International</publisher>
+ <pubPlace>Warrendale, PA</pubPlace>
+ </imprint>
+ </monogr>
+ <note>SAE Technical Paper 800905</note>
+</biblStruct>
+
+<biblStruct xml:id="b47">
+ <analytic>
+ <title level="a" type="main">On Application of the Rolling Contact Theory for Modelling of the UIC Link Suspension for Freight Wagons</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Piotrowski</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Zeszyty Naukowe Instytutu Pojazdów</title>
+ <imprint>
+ <biblScope unit="volume">3</biblScope>
+ <biblScope unit="issue">50</biblScope>
+ <biblScope unit="page" from="5" to="14" />
+ <date type="published" when="2003" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b48">
+ <analytic>
+ <title level="a" type="main">A new mathematical model of the behaviour of a four-axle freight wagon with UIC single-link suspension</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Matei</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1177/0954409711398173</idno>
+ </analytic>
+ <monogr>
+ <title level="j">Proceedings of the Institution of Mechanical Engineers</title>
+ <imprint>
+ <biblScope unit="volume">225</biblScope>
+ <biblScope unit="page">637</biblScope>
+ <date type="published" when="2011" />
+ </imprint>
+ </monogr>
+ <note>Part F: Journal of Rail and Rapid Transit</note>
+</biblStruct>
+
+<biblStruct xml:id="b49">
+ <monogr>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><forename type="middle">V</forename><surname>Vershinsky</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">V</forename><forename type="middle">N</forename><surname>Danilov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">V</forename><forename type="middle">N</forename><surname>Chelnokov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">I</forename><forename type="middle">I</forename></persName>
+ </author>
+ <title level="m">Wagon dynamics. Мoscow, Transport</title>
+ <imprint>
+ <date type="published" when="1972" />
+ <biblScope unit="volume">304</biblScope>
+ </imprint>
+ </monogr>
+ <note>in Russian</note>
+</biblStruct>
+
+<biblStruct xml:id="b50">
+ <monogr>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">B</forename><surname>Ballew</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">B</forename><forename type="middle">J</forename><surname>Chan</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Sandu</surname></persName>
+ </author>
+ <title level="m">Multibody dynamics modelling of the freight train bogie system Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">49</biblScope>
+ <biblScope unit="page" from="2011" to="501" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b51">
+ <analytic>
+ <title level="a" type="main">Modelling friction wedges, Part II: An improved model. Proceedings of IMECE04</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><forename type="middle">E</forename><surname>Klauser</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">ASME International Mechanical Engineering Congress &amp; Exposition</title>
+ <imprint>
+ <date type="published" when="2004-11-13" />
+ <publisher>American Society of Mechanical Engineering</publisher>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b52">
+ <analytic>
+ <title level="a" type="main">Freight car models and their computer-aided dynamic analysis</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><surname>Kovalev</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">N</forename><surname>Lysikov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">G</forename><surname>Mikheev</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">D</forename><surname>Pogorelov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">V</forename><surname>Simonov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">V</forename><surname>Yazykov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Zakharov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">I</forename><surname>Zharov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">I</forename><surname>Goryacheva</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Soshenkov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Torskaya</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Multibody System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">22</biblScope>
+ <biblScope unit="issue">4</biblScope>
+ <biblScope unit="page" from="399" to="423" />
+ <date type="published" when="2009" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b53">
+ <analytic>
+ <title level="a" type="main">On Calculation of Jacobian Matrices in Simulation of Multibody Systems</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">D</forename><surname>Pogorelov</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Preprints of the NATO Advanced Study Institute on Virtual Nonlinear Multibody Systems</title>
+ <editor>Schiehlen and Valasek</editor>
+ <meeting><address><addrLine>Prague</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="2002" />
+ <biblScope unit="page" from="159" to="164" />
+ </imprint>
+ <respStmt>
+ <orgName>Czech Technical University in Prague</orgName>
+ </respStmt>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b54">
+ <analytic>
+ <title level="a" type="main">Possibility of jamming and wedging in the three-piece trucks of a moving freight car</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">D</forename><surname>Mckisic</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">V</forename><forename type="middle">F</forename><surname>Ushkalov</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Zhechev</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">45</biblScope>
+ <biblScope unit="issue">1</biblScope>
+ <biblScope unit="page" from="61" to="67" />
+ <date type="published" when="2007" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b55">
+ <analytic>
+ <title level="a" type="main">Dynamic models of friction wedge dampers</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">P</forename><surname>Cusumano</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">F</forename><surname>Gardner</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of the 1997 IEEE/ASME Joint Rail Conference</title>
+ <meeting>the 1997 IEEE/ASME Joint Rail Conference<address><addrLine>Boston, MA</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="1920-03-18" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b56">
+ <analytic>
+ <title level="a" type="main">Modelling Freight Wagon Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Mcclanachan</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Y</forename><surname>Handoko</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Dhanasekar</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">D</forename><surname>Skerman</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Davey</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics Supplement</title>
+ <imprint>
+ <biblScope unit="volume">41</biblScope>
+ <biblScope unit="page" from="438" to="447" />
+ <date type="published" when="2004" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b57">
+ <monogr>
+ <title level="m" type="main">Modeling and dynamics of friction wedge dampers in railroad freight trucks Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">B</forename><surname>Kaiser</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">P</forename><surname>Cusumano</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><forename type="middle">F</forename><surname>Gardner</surname></persName>
+ </author>
+ <imprint>
+ <biblScope unit="volume">38</biblScope>
+ <biblScope unit="page" from="2002" to="55" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b58">
+ <analytic>
+ <title level="a" type="main">Chaos in a railway bogie</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><forename type="middle">H</forename><surname>Kaas-Petersen</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Acta Mechanica</title>
+ <imprint>
+ <biblScope unit="volume">61</biblScope>
+ <biblScope unit="page" from="89" to="107" />
+ <date type="published" when="1986" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b59">
+ <analytic>
+ <title level="a" type="main">On non-linear methods of bogie stability assessment using computer simulations</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">O</forename><surname>Polach</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Proceedings of the Institution of Mechanical Engineers</title>
+ <imprint>
+ <biblScope unit="volume">220</biblScope>
+ <biblScope unit="page" from="13" to="27" />
+ <date type="published" when="2006" />
+ </imprint>
+ </monogr>
+ <note>Part F: Journal of Rail and Rapid Transit</note>
+</biblStruct>
+
+<biblStruct xml:id="b60">
+ <analytic>
+ <title level="a" type="main">Limit cycle behaviour and chaotic motions of two-axle freight wagons with friction damping</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Multibody System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">8</biblScope>
+ <biblScope unit="issue">3</biblScope>
+ <biblScope unit="page" from="243" to="255" />
+ <date type="published" when="2002" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b61">
+ <analytic>
+ <title level="a" type="main">Dynamics of two-axle railway freight wagons with UIC standard suspension, Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Hoffmann</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>True</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1080/00423110600869594</idno>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">44</biblScope>
+ <biblScope unit="page" from="1" to="139" />
+ <date type="published" when="2006" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b62">
+ <analytic>
+ <title level="a" type="main">Analysis of the nonlinear dynamics of a 2axle freight wagon in curves, Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Di</forename><surname>Gialleonardo</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Bruni</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>True</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename></persName>
+ </author>
+ <idno type="doi">DOI:10.1080/00423114.2013.863363</idno>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">52</biblScope>
+ <biblScope unit="issue">1</biblScope>
+ <biblScope unit="page" from="125" to="141" />
+ <date type="published" when="2014" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b63">
+ <analytic>
+ <title level="a" type="main">Lateral hunting stability of railway vehicles running on elastic track structures</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">W</forename><forename type="middle">M</forename><surname>Zhai</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">K</forename><forename type="middle">Y</forename><surname>Wang</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Journal of Computational and Nonlinear Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">5</biblScope>
+ <biblScope unit="issue">4</biblScope>
+ <biblScope unit="page" from="41009" to="41010" />
+ <date type="published" when="2010" />
+ </imprint>
+ </monogr>
+ <note>ASME</note>
+</biblStruct>
+
+<biblStruct xml:id="b64">
+ <analytic>
+ <title level="a" type="main">Freight car curving performance in braked conditions</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Berghuvud</surname></persName>
+ </author>
+ <idno type="doi">23.DOI:10.1243/0954409021531656</idno>
+ </analytic>
+ <monogr>
+ <title level="j">Proceedings of the Institution of Mechanical Engineers, Part F: Journal of Rail and Rapid Transit</title>
+ <imprint>
+ <biblScope unit="volume">216</biblScope>
+ <date type="published" when="2002" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b65">
+ <monogr>
+ <title/>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><surname>Hecht</surname></persName>
+ </author>
+ <imprint/>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b66">
+ <monogr>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Keudel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename></persName>
+ </author>
+ <title level="m">Verbesserte Energieeffizienz durch radialeinstellendes Fahrwerk, Eisenbahningenieur 05</title>
+ <imprint>
+ <date type="published" when="2006" />
+ <biblScope unit="page" from="42" to="47" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b67">
+ <analytic>
+ <title level="a" type="main">Wheel/rail interface management in heavy haul railway operations-applying science and technology, Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><forename type="middle">D</forename><surname>Fröhling</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1080/00423110701413797</idno>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">45</biblScope>
+ <biblScope unit="issue">7-8</biblScope>
+ <biblScope unit="page" from="649" to="677" />
+ <date type="published" when="2007" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b68">
+ <analytic>
+ <title level="a" type="main">Minimising wheel wear by optimising the primary suspension stiffness and centre plate friction of selfsteering bogies, Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><forename type="middle">N</forename><surname>Fergusson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><forename type="middle">D</forename><surname>Fröhling</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>Klopper</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1080/00423110801993094</idno>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">46</biblScope>
+ <biblScope unit="issue">S1</biblScope>
+ <biblScope unit="page" from="457" to="468" />
+ <date type="published" when="2008" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b69">
+ <analytic>
+ <title level="a" type="main">Influence of switches and crossings on wheel profile evolution in freight vehicles. Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Casanueva</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Doulgerakis</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Jönsson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Stichel</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1080/00423114.2014.898779</idno>
+ </analytic>
+ <monogr>
+ <title level="j">International Journal of Vehicle Mechanics and Mobility</title>
+ <imprint>
+ <biblScope unit="volume">52</biblScope>
+ <biblScope unit="page" from="317" to="337" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b70">
+ <analytic>
+ <title level="a" type="main">A parametric study of the effects of freight vehicles on rolling contact fatigue of rail</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Tunna</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Urban</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1243/09544097JRRT228</idno>
+ </analytic>
+ <monogr>
+ <title level="j">Proceedings of the Institution of Mechanical Engineers</title>
+ <imprint>
+ <biblScope unit="volume">223</biblScope>
+ <biblScope unit="page">141</biblScope>
+ <date type="published" when="2009" />
+ </imprint>
+ </monogr>
+ <note>Part F: Journal of Rail and Rapid Transit</note>
+</biblStruct>
+
+<biblStruct xml:id="b71">
+ <analytic>
+ <title level="a" type="main">Whole life rail model application and development for RSSBdevelopment of an RCF damage parameter</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Burstow</surname></persName>
+ </author>
+ <idno>AEATR-ES-2003-832</idno>
+ <ptr target="http://www.rssb.co.uk" />
+ </analytic>
+ <monogr>
+ <title level="j">Rail Safety &amp; Standards Board</title>
+ <imprint>
+ <biblScope unit="volume">1</biblScope>
+ <date type="published" when="2003-10" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b72">
+ <analytic>
+ <title level="a" type="main">Wheel damage on the Swedish iron ore line investigated via multibody simulation</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Hossein</forename><surname>Nia</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Jönsson</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P.-A</forename><surname>Stichel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename></persName>
+ </author>
+ <idno type="doi">228:652.DOI:10.1177/0954409714523264</idno>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of the Institution of Mechanical Engineers</title>
+ <meeting>the Institution of Mechanical Engineers</meeting>
+ <imprint>
+ <date type="published" when="2014" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b73">
+ <analytic>
+ <title level="a" type="main">The dynamic effects of conventional freight car running over a dipped joint</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><forename type="middle">V</forename><surname>Dukkipati</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><surname>Dong</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">31</biblScope>
+ <biblScope unit="page" from="95" to="111" />
+ <date type="published" when="1999" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b74">
+ <monogr>
+ <title level="m" type="main">A test rig for measuring three piece bogie dynamic parameters applied to freight car application Vehicle System Dynamics 44 supplement</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">L</forename><forename type="middle">H</forename><surname>Ren</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">G</forename><surname>Shen</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Y</forename><forename type="middle">S</forename><surname>Hu</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2006" />
+ <biblScope unit="page" from="853" to="861" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b75">
+ <monogr>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">H</forename><surname>Wickens</surname></persName>
+ </author>
+ <title level="m">The dynamics of railway vehicles on straight track-fundamental considerations of lateral stability Proceedings of the Institution of Mechanical Engineers Part</title>
+ <imprint>
+ <biblScope unit="volume">29</biblScope>
+ <biblScope unit="page" from="3" to="1965" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b76">
+ <monogr>
+ <title level="m" type="main">Suspension design for high performance two-axle freight vehicles Proceedings of the Institution of Mechanical Engineers Part</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">H</forename><surname>Wickens</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">O</forename><surname>Gilchrist</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">E W</forename><surname>Hobbs</surname></persName>
+ </author>
+ <idno>3D 1969-70</idno>
+ <imprint>
+ <biblScope unit="volume">184</biblScope>
+ <biblScope unit="page" from="22" to="36" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b77">
+ <monogr>
+ <title level="m" type="main">Tracking truck</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><forename type="middle">B</forename><surname>Webber</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1968-07" />
+ <biblScope unit="volume">339466230</biblScope>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b78">
+ <monogr>
+ <title level="m" type="main">Uklad zawieszenia pojazdu kolejowego, zwlaszcza dwuosiowego wagonu towarowego</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Greenbrier</forename><surname>Europe</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Wagony</forename><surname>Swidnica</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><forename type="middle">A</forename></persName>
+ </author>
+ <imprint/>
+ </monogr>
+ <note>PL 207920 B1 B61F 5/30 (2006.01</note>
+</biblStruct>
+
+<biblStruct xml:id="b79">
+ <monogr>
+ <title level="m" type="main">Advances in Rail Wagon Design&apos; Proceedings of the Institution of Mechanical Engineers, Part F: Journal of Rail and Rapid Transit</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Etwell</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1990-01" />
+ <biblScope unit="volume">204</biblScope>
+ <biblScope unit="page" from="45" to="54" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b80">
+ <monogr>
+ <title level="m" type="main">Association of American Railroads. Manual of standards and recommended practices. Section C-part II. Design, fabrication, and construction of freight cars</title>
+ <imprint/>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b81">
+ <monogr>
+ <title level="m" type="main">Chapter 11: Service worthiness tests and analyses for new freight cars</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M-1001</forename></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2007" />
+ <biblScope unit="volume">374</biblScope>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b82">
+ <monogr>
+ <title level="m" type="main">Association of American Railroads. Manual of standards and recommended practices. Section D. Trucks and truck details Specification M-976 Truck performance for rail cars</title>
+ <imprint>
+ <date type="published" when="2002" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b83">
+ <analytic>
+ <title level="a" type="main">Comparison of different types of friction wedge suspensions in freight wagons</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><surname>Orlova</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">E</forename><surname>Rudakova</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of the 8-th International Conference on Railway Bogies and Running Gears</title>
+ <meeting>the 8-th International Conference on Railway Bogies and Running Gears<address><addrLine>Budapest: BUTE</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="2010" />
+ <biblScope unit="page" from="41" to="50" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b84">
+ <monogr>
+ <title level="m" type="main">Influence of bogie to car body connection parameters on stability and curving of freight vehicle // Extended abstracts 6th international conference Railway bogies and running gears</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Boronenko</forename><forename type="middle">P</forename><surname>Yu</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">A</forename><forename type="middle">M</forename><surname>Orlova</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2004-09" />
+ <biblScope unit="page" from="23" to="25" />
+ <pubPlace>Budapest: BUTE</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b85">
+ <monogr>
+ <title level="m" type="main">Shear Stiffner Linkages for Radial Bogies</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>Scheffel</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><forename type="middle">H</forename><surname>Smit</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1997" />
+ <biblScope unit="volume">27</biblScope>
+ </imprint>
+ </monogr>
+ <note>Supplement to Vehicle System Dynamics</note>
+</biblStruct>
+
+<biblStruct xml:id="b86">
+ <analytic>
+ <title level="a" type="main">The influence of inter-axle linkages on stability and guidance of freight bogies</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Boronenko</forename><forename type="middle">P</forename><surname>Yu</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proceedings of the 8-th mini conference on vehicle system dynamics, identification and anomalies</title>
+ <meeting>the 8-th mini conference on vehicle system dynamics, identification and anomalies<address><addrLine>Budapest: BUTE</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="2002" />
+ <biblScope unit="page" from="175" to="182" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b87">
+ <analytic>
+ <title level="a" type="main">Suspension of freight wagon bogiewith the Lenoir friction damper ensuring low wear of wheels and good lateral dynamics of the wagon</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Piotrowski</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><surname>Pazdzierniak</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">T</forename><surname>Adamczewski</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Proc. of XVIII conference &apos;Pojazdy Szynow</title>
+ <meeting>of XVIII conference &apos;Pojazdy Szynow</meeting>
+ <imprint>
+ <date type="published" when="2008" />
+ <biblScope unit="volume">I</biblScope>
+ <biblScope unit="page" from="199" to="211" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b88">
+ <analytic>
+ <title level="a" type="main">Wear and energy-saving freight bogie designs with rubber primary springs: principles and experiences</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Hecht</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">JRRT</title>
+ <imprint>
+ <biblScope unit="volume">227</biblScope>
+ <biblScope unit="page" from="105" to="110" />
+ <date type="published" when="2009" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b89">
+ <analytic>
+ <title level="a" type="main">Innovative Freight Wagons-A Precondition to increase the MarketShare of Rail Freight</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Hecht</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Archives of Transport</title>
+ <imprint>
+ <biblScope unit="volume">29</biblScope>
+ <biblScope unit="page" from="17" to="26" />
+ <date type="published" when="2014" />
+ </imprint>
+ </monogr>
+ <note>Polish Academy of Sciences, committee of Transport</note>
+</biblStruct>
+
+<biblStruct xml:id="b90">
+ <monogr>
+ <title level="m" type="main">A new design approach for railway vehicle suspension; Rail International</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>Scheffel</surname></persName>
+ </author>
+ <idno>1974. - -№10.-P. 638-651</idno>
+ <imprint/>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b91">
+ <monogr>
+ <title level="m" type="main">Project INFRA-RADIAL-bogies for axle loads of 25 t-test and simulation&apos;; XXI Century Rolling Stock: Ideas, Requirements, Projects Conference</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">W</forename><surname>Kik</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">D</forename><surname>Scholdan</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Stephanides</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2007" />
+ <pubPlace>St. Petersburg</pubPlace>
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b92">
+ <monogr>
+ <title level="m" type="main">Simulation of longitudinal dynamics of long freight trains in positioning operations Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Z</forename><surname>Qi</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Z</forename><surname>Huang</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">X</forename><surname>Kong</surname></persName>
+ </author>
+ <imprint>
+ <biblScope unit="volume">50</biblScope>
+ <biblScope unit="page" from="2012" to="1409" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b93">
+ <monogr>
+ <title level="m" type="main">Numerical and experimental approach for the evaluation of severe longitudinal dynamics of heavy freight trains Vehicle System Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><surname>Belforte</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">F</forename><surname>Celi</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">G</forename><surname>Diana</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">S</forename><surname>Melzi</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="2008" />
+ <biblScope unit="volume">46</biblScope>
+ <biblScope unit="page" from="937" to="955" />
+ </imprint>
+ </monogr>
+ <note>Supplement</note>
+</biblStruct>
+
+<biblStruct xml:id="b94">
+ <monogr>
+ <title level="m" type="main">Longitudinal train dynamics&apos; in Handbook of Railway Vehicle Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Cole</surname></persName>
+ </author>
+ <imprint>
+ <date type="published" when="1999" />
+ </imprint>
+ </monogr>
+ <note>Iwnicki ed Taylor and Francis</note>
+</biblStruct>
+
+<biblStruct xml:id="b95">
+ <analytic>
+ <title level="a" type="main">An investigation of the effect of bogie and wagon pitch associated with longitudinal train dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Mcclanachan</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">C</forename><surname>Cole</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">D</forename><surname>Roach</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">B</forename><surname>Scown</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">The dynamics of vehicles on roads and on tracks Vehicle System Dynamics Supplement Swets &amp; Zeitlinger Amsterdam</title>
+ <imprint>
+ <date type="published" when="1999" />
+ <biblScope unit="page" from="374" to="385" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b96">
+ <analytic>
+ <title level="a" type="main">Recent advancements in bluff and draft testing techniques</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>El-Sibaie</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="m">Fifth International Heavy Haul Conference</title>
+ <meeting><address><addrLine>Beijing</addrLine></address></meeting>
+ <imprint>
+ <date type="published" when="1993" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b97">
+ <analytic>
+ <title level="a" type="main">White Paper Innovative Rail Freight Wagon 2030</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">R</forename><surname>Koenig</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">M</forename><surname>Hecht</surname></persName>
+ </author>
+ <ptr target="http://www.schienenfzg.tu-Berlin.de/fileadmin/fg62/Dokumente/Downloads/White_Paper_Innovative_Rail_Freight_Wagon_2030.pdf" />
+ </analytic>
+ <monogr>
+ <title level="j">TU Dresden TU Berlin</title>
+ <imprint>
+ <date type="published" when="2012" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b98">
+ <analytic>
+ <title level="a" type="main">On the Theory of Nonlinear Dynamics and its Applications in Vehicle Systems Dynamics</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">H</forename><surname>True</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Vehicle System Dynamics</title>
+ <imprint>
+ <biblScope unit="volume">31</biblScope>
+ <biblScope unit="issue">5-6</biblScope>
+ <biblScope unit="page" from="393" to="421" />
+ <date type="published" when="1999" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b99">
+ <analytic>
+ <title level="a" type="main">Numerical simulation of wheel wear evolution for heavy haul railway</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">P</forename><surname>Wang</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">L</forename><surname>Gao</surname></persName>
+ </author>
+ <idno type="doi">DOI:10.1007/s11771-015-2510-1</idno>
+ </analytic>
+ <monogr>
+ <title level="j">J. Cent. South Univ</title>
+ <imprint>
+ <biblScope unit="volume">22</biblScope>
+ <biblScope unit="page" from="196" to="207" />
+ <date type="published" when="2015" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python_hadoop/tests/files/example.cdx b/python_hadoop/tests/files/example.cdx
new file mode 100644
index 0000000..84e3271
--- /dev/null
+++ b/python_hadoop/tests/files/example.cdx
@@ -0,0 +1,20 @@
+edu,cmu,cs,adm,reports-archive)/anon/usr0/ftp/usr0/anon/2002/cmu-cs-02-119.pdf 20170706005950 http://reports-archive.adm.cs.cmu.edu/anon/usr0/ftp/usr0/anon/2002/CMU-CS-02-119.pdf application/pdf 200 MQHD36X5MNZPWFNMD5LFOYZSFGCHUN3V - - 361006 17120058 CITESEERX-CRAWL-2017-06-20-20170706004100259-00924-00932-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170706005946792-00926-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+fi,tkk,lib)/diss/2001/isbn951225459x/isbn951225459x.pdf 20170705074926 http://lib.tkk.fi/Diss/2001/isbn951225459X/isbn951225459X.pdf application/pdf 200 KJBCOT7LGBNIAVGEGPUELK5OK6RTFORR - - 344175 255650124 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705074843696-00134-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+org,oxfordjournals,nar)/cgi/reprint/gkl1060v1.pdf 20170706035441 http://nar.oxfordjournals.org/cgi/reprint/gkl1060v1.pdf text/html 301 OX6MLVDFURLT2KSYCXUYW2PZNOVFSEVF - - 697 49346051 CITESEERX-CRAWL-2017-06-20-20170706034741172-00140-00149-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706035435634-00148-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+org,ifaamas)/proceedings/aamas09/pdf/01_full%20papers/02_08_fp_0272.pdf 20170706081902 http://www.ifaamas.org/Proceedings/aamas09/pdf/01_Full%20Papers/02_08_FP_0272.pdf application/pdf 200 GYHX35QJWRJELWJ5GDQZPTPOUUZOCTKF - - 251180 34635154 CITESEERX-CRAWL-2017-06-20-20170706081825105-00419-00428-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706081838210-00420-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+de,fau,cs)/publications/2014/lukas_14_masterthesis.pdf 20170705101722 http://www4.cs.fau.de/Publications/2014/lukas_14_masterthesis.pdf application/pdf 200 GIUQT7SXZ33TWEFBM2MWURJI2M3QE3IW - - 1290532 71068435 CITESEERX-CRAWL-2017-06-20-20170705101605019-00279-00288-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705101714659-00281-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+de,bund,jki,pub)/index.php/jabfq/article/download/3568/4462 20170706041152 http://pub.jki.bund.de/index.php/JABFQ/article/download/3568/4462/ text/html 301 XZBNO24W2ZPQQMJYE6YUUCSRUF7G3ZBT - - 552 417292708 CITESEERX-CRAWL-2017-06-20-20170706040506112-00160-00169-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706041021844-00165-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+whois://whois.arin.net/z+%2B+132.177.133.114 20170713120653 whois://whois.arin.net/z+%2B+132.177.133.114 text/plain - IDEID4YQ6MVJSOE57NPVDLL53ZB3J4DX - - 876 30983517 CITESEERX-CRAWL-2017-06-20-20170707064626094-01007-01015-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170711214025652-01014-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+za,co,csir,researchspace)/dspace/bitstream/10204/4048/1/smith2_2010.pdf 20170706094159 http://researchspace.csir.co.za/dspace/bitstream/10204/4048/1/Smith2_2010.pdf unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 446 104830407 CITESEERX-CRAWL-2017-06-20-20170706093829986-00509-00518-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706094137978-00512-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+org,annals)/article.aspx?articleid=705034 20170707013120 http://annals.org/article.aspx?articleid=705034 text/html 301 QQYKL57QSERLFM3LXSWMNOFXMOCN7C5G - - 22665 28113974 CITESEERX-CRAWL-2017-06-20-20170707013100780-00967-00976-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170707013100780-00967-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+org,annals)/pdfaccess.ashx?url=/data/journals/aim/20105/0000605-200512200-00013.pdf 20170707045304 http://annals.org/pdfaccess.ashx?url=/data/journals/aim/20105/0000605-200512200-00013.pdf text/html 302 423S7EMGLCVIZ3FLVD7TLAG75HWE4RGI - - 644 222908628 CITESEERX-CRAWL-2017-06-20-20170707042504366-00997-01006-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170707045044604-00999-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+com,sagepub,spi)/content/28/4/501.full.pdf 20170705092027 http://spi.sagepub.com/content/28/4/501.full.pdf unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 396 553180242 CITESEERX-CRAWL-2017-06-20-20170705091311851-00219-00228-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705091759818-00223-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+ir,mediaj)/favicon.ico 20170705075240 http://mediaj.ir/favicon.ico text/html 404 E3WSNQ7JAFOW7N3ZJ6GLV27T52T25JDK - - 589 455827180 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705075051100-00135-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+com,sagepub,jpr)/content/8/3-4/239.full.pdf 20170705074931 http://jpr.sagepub.com/content/8/3-4/239.full.pdf unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 400 270368088 CITESEERX-CRAWL-2017-06-20-20170705074433815-00129-00138-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705074843696-00134-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+jp,co,nittuden)/business/pdf/transparent_thermoplastic_resin_with_electron_beam_cross-linking.pdf 20170706083459 http://www.nittuden.co.jp/business/pdf/Transparent_Thermoplastic_Resin_with_Electron_Beam_Cross-Linking.pdf application/pdf 200 V32E3CCO7NMI2M4OHLKG73DXD72LR4B2 - - 715081 761088410 CITESEERX-CRAWL-2017-06-20-20170706082646066-00429-00438-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706083257353-00436-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+lt,lms)/robots.txt 20170705122708 http://www.lms.lt/robots.txt text/plain 200 PF3HTQQT2ULYRWFLJGUWZKHTVZUVMZ2F - - 592 668333707 CITESEERX-CRAWL-2017-06-20-20170705121748408-00399-00408-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705122352502-00406-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+hu,bme,phy)/~szalay/pub/multipartcriteriaposter.pdf 20170705124828 http://www.phy.bme.hu/%7Eszalay/pub/multipartcriteriaPoster.pdf application/pdf 200 L3TUEEZLBJTHAVH74B5N426FAIDBCCOE - - 187866 964760782 CITESEERX-CRAWL-2017-06-20-20170705123641979-00419-00428-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705124315591-00426-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+org,adb,openaccess)/bitstream/handle/11540/1260/new-regime-sme-finance-asia.pdf;jsessionid=f966a3bdac9882ec5a7c326b130f6f81?sequence=1 20170705090940 https://openaccess.adb.org/bitstream/handle/11540/1260/new-regime-sme-finance-asia.pdf%3Bjsessionid%3DF966A3BDAC9882EC5A7C326B130F6F81?sequence%3D1 unk 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 515 634039376 CITESEERX-CRAWL-2017-06-20-20170705090333400-00209-00218-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705090728803-00212-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+org,physiology,ajpregu)/content/272/4/r1084 20170706131006 http://ajpregu.physiology.org/content/272/4/R1084 text/html 200 3FOQSKT4WBYOUA6VKKJCEQCN6QF35ANT - - 27346 336293585 CITESEERX-CRAWL-2017-06-20-20170706130432396-00707-00716-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706130850866-00711-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
+de,desy,www-it)/common/documentation/cd-docs/sc2002/paperpdf/pap234.pdf 20170705121813 http://www-it.desy.de/common/documentation/cd-docs/SC2002/paperpdf/pap234.pdf application/pdf 200 BONCZ4NNGRNYR22ASFVU7VYTQ24RRNP4 - - 72421 381715704 CITESEERX-CRAWL-2017-06-20-20170705120827801-00389-00398-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705121708700-00397-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+org,oxfordjournals,bmb)/content/28/3/247.full.pdf 20170706014948 http://bmb.oxfordjournals.org/content/28/3/247.full.pdf text/html 301 EJWYVOPONJRARK7SGG6COFRN7CSTHROY - - 643 119398161 CITESEERX-CRAWL-2017-06-20-20170706014800946-00020-00029-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706014907678-00022-3671~wbgrp-svc285.us.archive.org~8443.warc.gz
diff --git a/python_hadoop/tests/files/example_grobid_metadata.json b/python_hadoop/tests/files/example_grobid_metadata.json
new file mode 100644
index 0000000..a2d18db
--- /dev/null
+++ b/python_hadoop/tests/files/example_grobid_metadata.json
@@ -0,0 +1,5 @@
+{"abstract": "In this paper an analytical model is presented for the Micro-Cantilever (MC) of Atomic Force Microscopy with Side Wall probe (AFM-SW) in the tapping excitation mode. In this model the couple motion of the MC is taken into account while the torsional motion is considered as an undesirable motion which is coupled with the vertical motion. To this end, the effect of several parameters, namely; probe mass, probe dislocation, sidewall extension length, and tip sample interaction force is investigated on the occurrence probability of torsional and vertical motions. It is found that the probe dislocation is the prerequisite factor of the undesired motion happening. For sake of validation, the analytical results are compared against the previously published results, and an excellent agreement is observed. Abstrak Dalam kertas ini, model analitikal dipersembahkan bagi micro-julur Mikroskop Daya Atom dengan prob dinding-sisi dan dalam mod pengujaan menoreh. Dalam model ini, gerakan pasangan bagi mikro-julur diambil kira manakala gerakan kilasan dianggap sebagai gerakan yang tidak diingini yang digandingkan dengan pergerakan menegak. Untuk tujuan ini , kesan daripada beberapa parameter, iaitu; jisim prob, kehelan prob, panjang lanjutan sisi, dan daya interaksi di antara tip dan sampel disiasat keatas kebarangkalian berlakunya gerakan kilasan dan menegak. Didapati bahawa kehelan prob adalah faktor prasyarat berlakunya gerakan yang tidak diingini. Untuk pengesahan, keputusan analisis ini dibandingkan dengan keputusan yang sebelum ini telah diterbitkan, dan didapati persetujuannya sangat baik. Kata kunci: Mokroskop daya atom, prob dind ing sisi, micro-jalur, getaran, gerakan pasangan", "acknowledgement": "Acknowledgement We are grateful for the UTM scholarship to Author 1. 
Authors gratefully acknowledge t he Research Institute of Petroleum Industry (RIPI) and the Iran Nanotechnology Laboratory Network (INLN) for their support.", "authors": [{"name": "Farzad Mokhtarinezhad"}, {"name": "Roslan Rahman"}, {"name": "Sina Eftekhar"}, {"name": "Sadegh Hassani"}], "citations": [{"authors": [{"name": "Julie Last"}, {"name": "Paul Russell"}, {"name": "P aul Nealey"}, {"name": "Christopher Murphy"}], "date": "2010", "id": "b0", "index": 0, "issue": null, "journal": "Investigative Ophthalmology & Visual Science", "publisher" : null, "title": "The applications of atomic force microscopy to vision science", "url": null, "volume": "51"}, {"authors": [{"name": "G Binnig"}, {"name": "C Quate"}, {"name": "C Geber"}], "date": "1986", "id": "b1", "index": 1, "issue": null, "journal": "Phys Rev Let", "publisher": null, "title": "Atomic force microscope", "url": null, "vol ume": "56"}, {"authors": [{"name": "C Wright"}, {"name": "Armstrong"}], "date": "2006", "id": "b2", "index": 2, "issue": null, "journal": "Surf Interface Anal", "publisher" : null, "title": "The application of atomic force microscopy force measurements to the characterisation of microbial surfaces", "url": null, "volume": "38"}, {"authors": [{ "name": "John Withers"}, {"name": "D Aston"}], "date": "2006", "id": "b3", "index": 3, "issue": null, "journal": "Advances in Colloid and Interface Science", "publisher": null, "title": "Nanomechanical measurements with AFM in the elastic limit", "url": null, "volume": "120"}, {"authors": [{"name": "Dara Bayat"}, {"name": "Terunobu Akiyama"}, {"name": "F Nicolaas"}, {"name": "Urs De Rooij"}, {"name": "Staufer"}], "date": "2008", "id": "b4", "index": 4, "issue": null, "journal": "Microelectronic Engineering", "p ublisher": null, "title": "Dynamic behavior of the tuning fork AFM probe", "url": null, "volume": "85"}, {"authors": [{"name": "M Kahrobaiyan"}, {"name": "M Ahmadian"}, {"name": "P Haghighi"}, {"name": "A Haghighi"}], "date": "2010", "id": 
"b5", "index": 5, "issue": null, "journal": "International Journal of Mechanical Sciences", "publisher": null, "title": "Sensitivity and resonant frequency of an AFM with sidewall and top-surface probes for both flexural and torsional modes", "url": null, "volume": "52"}, {"a uthors": [{"name": "Gaoliang Dai"}, {"name": "Helmut Wolff"}, {"name": "Frank Pohlenz"}, {"name": "Hans-Ulrich Danzebrink"}, {"name": "G5Cu00fcnter Wilkening"}], "date": "2006", "id": "b6", "index": 6, "issue": null, "journal": "APPLIED PHYSICS LETTERS", "publisher": null, "title": "Atomic force probe for sidewall scanning of nano-and micro structures", "url": null, "volume": "88"}, {"authors": [{"name": "Gaoliang Dai"}, {"name": "Helmutwolff"}, {"name": "Min Thomasweimann"}, {"name": "Frank Xu"}, {"name": "Ha ns-Ulrich Pohlenz"}, {"name": "Danzebrink"}], "date": "2007", "id": "b7", "index": 7, "issue": null, "journal": "Meas. Sci. Technol", "publisher": null, "title": "Nanoscale surface measurements at sidewalls of nanoand micro-structures", "url": null, "volume": "18"}, {"authors": [{"name": "Win-Jin Chang"}, {"name": "Haw-Long Lee"}, {"name": "T erry Yuan-Fang Chen"}], "date": "2008", "id": "b8", "index": 8, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "Study of the sensitivity of the fi rst four flexural modes of an AFM cantilever with a sidewall probe", "url": null, "volume": "108"}, {"authors": [{"name": "Xiaohui Tang"}, {"name": "Vincent Bayot"}, {"name": "Nicolas Reckinger"}, {"name": "Denis Flandre"}, {"name": "Jean-Pierre Raskin"}, {"name": "Emmanuel Dubois"}, {"name": "Bernard Nysten"}], "date": "2009", "id": "b9", "i ndex": 9, "issue": null, "journal": "IEEE Transactions on Nanotechnogoly", "publisher": null, "title": "A Simple Method for Measuring Si-Fin Sidewall Roughness by AFM", "ur l": null, "volume": "8"}, {"authors": [{"name": "Ali Hossein Nejat Pishkenari"}, {"name": "Meghdari"}], "date": "2011", "id": "b10", "index": 10, "issue": null, 
"journal": "Ultramicroscopy", "publisher": null, "title": "Influence of the tip mass on the tip-sample interactions in TM-AFM", "url": null, "volume": "111"}, {"authors": [{"name": "S ohrab Eslami"}, {"name": "Naderjalili"}], "date": "2012", "id": "b11", "index": 11, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "A comprehensiv e modeling and vibration analysis of AFM microcantilevers subjected to nonlinear tip-sample interaction forces", "url": null, "volume": "117"}, {"authors": [{"name": "Yaxin Song"}, {"name": "Bharat Bhushan"}], "date": "2006", "id": "b12", "index": 12, "issue": null, "journal": "Journal of Applied Physics", "publisher": null, "title": "Couplin g of cantilever lateral bending and torsion in torsional resonance and lateral excitation modes of atomic force microscopy", "url": null, "volume": "99"}, {"authors": [{"name": "Haw-Long Lee"}, {"name": "Win-Jin Chang"}], "date": "2008", "id": "b13", "index": 13, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "Couple d lateral bending-torsional vibration sensitivity of atomic force microscope cantilever", "url": null, "volume": "108"}, {"authors": [{"name": "Farzad Mokhtarinezhad"}], "d ate": "2015", "id": "b14", "index": 14, "issue": null, "journal": null, "publisher": null, "title": "Jurnal Teknologi (Sciences & Engineering)", "url": null, "volume": "76" }, {"authors": [{"name": "F Mokhtari-Nezhad"}, {"name": "A Saidi"}, {"name": "S Ziaei-Rad"}], "date": "2009", "id": "b15", "index": 15, "issue": null, "journal": "Ultramicr oscopy", "publisher": null, "title": "Influence of the tip mass and position on the AFM cantilever dynamics: Coupling between bending, torsion and flexural modes", "url": null, "volume": "109"}, {"authors": [{"name": "Arvind Raman"}, {"name": "John Melcher"}, {"name": "Ryan Tung"}], "date": "2008", "id": "b16", "index": 16, "issue": null, "jo urnal": "Nanotodays", "publisher": null, "title": "Cantilever dynamics in atomic force 
microscopy", "url": null, "volume": "3"}, {"authors": [{"name": "Nader Jalili"}, {"name": "Karthik Laxminarayana"}], "date": "2004", "id": "b17", "index": 17, "issue": null, "journal": "Mechatronic", "publisher": null, "title": "A review of atomic force mic roscopy imaging systems: application to molecular metrology and biological sciences", "url": null, "volume": "14"}, {"authors": [{"name": "B Derjaguin"}, {"name": "V Muller "}, {"name": "Y Toporov"}], "date": "1975", "id": "b18", "index": 18, "issue": null, "journal": "J. Colloid Interf. Sci", "publisher": null, "title": "Effect of contact def ormations on the adhesion of particles", "url": null, "volume": "53"}, {"authors": [{"name": "Yaxin Song"}, {"name": "Bharat Bhushan"}], "date": "2006", "id": "b19", "index ": 19, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "Simulation of dynamic modes of atomic force microscopy using a 3D finite element model", "u rl": null, "volume": "106"}, {"authors": [{"name": "K Johnson"}, {"name": "K Kendall"}, {"name": "A Roberts"}], "date": "1971", "id": "b20", "index": 20, "issue": null, "jo urnal": "Proc. R. Soc. London Ser. 
A", "publisher": null, "title": "Surface energy and the contact of elastic solids", "url": null, "volume": "324"}, {"authors": [{"name": "D Gorman"}], "date": "1975", "id": "b21", "index": 21, "issue": null, "journal": null, "publisher": null, "title": "Free Vibration Analysis of Beams and Shafts", "url": null, "volume": null}, {"authors": [{"name": "M Mahdavi"}, {"name": "A Farshidianfar"}, {"name": "M Tahani"}, {"name": "S Mahdavi"}, {"name": "H Dalir"}], "date": "2008", "id ": "b22", "index": 22, "issue": null, "journal": "Ultramicroscopy", "publisher": null, "title": "A more comprehensive modeling of atomic force microscope cantilever", "url" : null, "volume": "109"}, {"authors": [{"name": "M Reinstadtler"}, {"name": "U Rabe"}, {"name": "V Scherer"}, {"name": "U Hartmann"}, {"name": "A Goldade"}, {"name": "B Bhu shan"}, {"name": "W Arnold"}], "date": "2003", "id": "b23", "index": 23, "issue": null, "journal": "Applied physics letters", "publisher": null, "title": "On the nanoscale measurement of friction using atomic-force microscope cantilever torsional resonances", "url": null, "volume": "82"}, {"authors": [{"name": "M Reinst5Cu00e4dtler"}, {"name": "T Kasai"}, {"name": "U Rabe"}, {"name": "B Bhushan"}, {"name": "W Arnold"}], "date": "2005", "id": "b24", "index": 24, "issue": null, "journal": "Journal of Physics D: Applied Physics", "publisher": null, "title": "Imaging and measurement of elasticity and friction using the TRmode", "url": null, "volume": "38"}], "date": "2015", "doi": null, "journal": {"eissn": null, "issn": null, "issue": null, "name": null, "publisher": null, "volume": "76"}, "title": "Jurnal Teknologi Full Paper INVESTIGATION OF TORSI ONAL DEFLECTION AS AN UNDESIRED MOTION IN ATOMIC FORCE MICROSCOPY WITH SIDEWALL PROBE"}
+{"abstract": "Eight months after triple valve replacement with Bjork-Shiley tilting disc valves a patient developed symptoms and signs suggesting malfunction of the prosthesis in the tricuspid position. This was confirmed by echocardiography and angiocardiography, and at operation the di sc of the prosthesis was found to be stuck half-open by fibrin and clot. A further 11 patients with the same type of prosthesis in the tricuspid position were then studied by phonocardiography and echocardiography. In one of these the prosthesis was found to be stuck and this was confirmed by angiocardiography and surgery. These 2 cases are r eported in detail and thefindings in the other 10 are discussed. The implications of this high incidence of malfunction of the Bj6rk-Shiley prosthesis in the tricuspid posi tion are considered. Echocardiography appears to be essential in the follow-up of such patients.", "acknowledgement": null, "authors": [{"name": "P Bourdillon"}, {"name": " G Sharratt"}], "citations": [{"authors": [{"name": "J Assad-Morell"}, {"name": "A Tajik"}, {"name": "M Anderson"}, {"name": "R Tancredi"}, {"name": "R Wallace"}, {"name": " E Giuliani"}], "date": "1974", "id": "b0", "index": 0, "issue": null, "journal": "Mayo Clinic Proceedings", "publisher": null, "title": "Malfunctioning tricuspid valve pros thesis", "url": null, "volume": "49"}, {"authors": [{"name": "R Bache"}, {"name": "A From"}, {"name": "A Castaneda"}, {"name": "C Jorgensen"}, {"name": "Wang"}, {"name": "Y "}], "date": "1972", "id": "b1", "index": 1, "issue": null, "journal": "Chest", "publisher": null, "title": "Late thrombotic obstruction of Starr-Edwards tricuspid valve pr osthesis", "url": null, "volume": null}, {"authors": [{"name": "I Belenkie"}, {"name": "M Carr"}, {"name": "R Schlant"}, {"name": "D Nutter"}, {"name": "P Symbas"}], "date" : "1973", "id": "b2", "index": 2, "issue": null, "journal": "American Heart,Journal", "publisher": null, "title": "Malfunction of a Cutter Smeloff mitral 
ball valve prosthe sis: diagnosis by phonocardiography and echocardiography", "url": null, "volume": "86"}, {"authors": [{"name": "J Douglas"}, {"name": "Williams"}, {"name": "G"}], "date": " 1974", "id": "b3", "index": 3, "issue": null, "journal": "Circulation", "publisher": null, "title": "Echocardiographic evaluation of the Bjork-Shiley prosthetic valve", "ur l": null, "volume": "50"}, {"authors": [{"name": "J Gimenez"}, {"name": "W Winters"}, {"name": "Jr"}, {"name": "J Davila"}, {"name": "J Connell"}, {"name": "K Klein"}], "da te": "1965", "id": "b4", "index": 4, "issue": null, "journal": "American Journal of the Medical Sciences", "publisher": null, "title": "Dynamics of the StarrEdwards ball va lve prosthesis: a cine-fluorographic and ultrasonic study in humans", "url": null, "volume": "250"}, {"authors": [{"name": "M Johnson"}, {"name": "J Holmes"}, {"name": "Pat on"}, {"name": "B"}], "date": "1973", "id": "b5", "index": 5, "issue": null, "journal": "Circulation", "publisher": null, "title": "Echocardiographic determination of mitra l disc valve excursion", "url": null, "volume": "47"}, {"authors": [{"name": "M Johnson"}, {"name": "B Paton"}, {"name": "J Holmes"}], "date": "1970", "id": "b6", "index": 6, "issue": null, "journal": "Circulation", "publisher": null, "title": "Ultrasonic evaluation of prosthetic valve motion", "url": null, "volume": null}, {"authors": [{"name": "H Miller"}, {"name": "D Gibson"}, {"name": "J Stephens"}], "date": "1973", "id": "b7", "index": 7, "issue": null, "journal": "British Heart Journal", "publisher": null , "title": "Role of echocardiography and phonocardiography in diagnosis of mitral paraprosthetic regurgitation with Starr-Edwards prostheses", "url": null, "volume": "35"}, {"authors": [{"name": "P Oliva"}, {"name": "M Johnson"}, {"name": "M Pomerantz"}, {"name": "Levene"}, {"name": "A"}], "date": "1973", "id": "b8", "index": 8, "issue": null , "journal": "American journal of Cardiology", "publisher": null, "title": 
"Dysfunction of the Beall mitral prosthesis and its detection by cinefluoroscopy and echocardiogr aphy", "url": null, "volume": null}, {"authors": [{"name": "J Pfeifer"}, {"name": "N Goldschlager"}, {"name": "T Sweatman"}, {"name": "F Gerbode"}, {"name": "A Selzer"}], " date": "1972", "id": "b9", "index": 9, "issue": null, "journal": "American J7ournal of Cardiology", "publisher": null, "title": "Malfunction of mitral ball valve prosthesis due to thrombus: report of 2 cases with notes on early clinical diagnosis", "url": null, "volume": "29"}, {"authors": [{"name": "H Samaan"}, {"name": "R Murali"}], "date": "1970", "id": "b10", "index": 10, "issue": null, "journal": "Thorax", "publisher": null, "title": "Acute tricuspid valve obstruction following the use of tricuspid ball va lve prosthesis", "url": null, "volume": null}, {"authors": [{"name": "S Suwansirikul"}, {"name": "E Glassman"}, {"name": "F Raia"}, {"name": "F Spencer"}], "date": "1974", "id": "b11", "index": 11, "issue": null, "journal": "American J'ournal of Cardiology", "publisher": null, "title": "Late thrombosis of Starr-Edwards tricuspid ball valve pr osthesis", "url": null, "volume": "34"}, {"authors": [{"name": "Vander"}, {"name": "J Veer"}, {"name": "Jr"}, {"name": "G Rhyneer"}, {"name": "R Hodam"}, {"name": "F Kloste r"}], "date": "1971", "id": "b12", "index": 12, "issue": null, "journal": "Circulation", "publisher": null, "title": "Obstruction of tricuspid ball-valve prostheses", "url" : null, "volume": null}, {"authors": [{"name": "W Winters"}, {"name": "Jr"}, {"name": "J Gimenez"}, {"name": "L Soloff"}], "date": "1967", "id": "b13", "index": 13, "issue" : null, "journal": "American journal of Cardiology", "publisher": null, "title": "Clinical application of ultrasound in the analysis of prosthetic ball valve function", "ur l": null, "volume": "19"}, {"authors": [{"name": "P D V Requests For Reprints To Dr"}, {"name": "Western Bourdillon"}, {"name": "Hospital"}, {"name": "Oakley Road"}], "date 
": false, "id": "b14", "index": 14, "issue": null, "publisher": null, "title": null, "url": null, "volume": null}], "date": "1976", "doi": null, "journal": {"eissn": null, "issn": null, "issue": null, "name": "British Heart Journal", "publisher": null, "volume": "38"}, "title": "Malfunction of Bjork-Shiley valve prosthesis in tricuspid positi on"}
+{"abstract": "The interference is the major factor disrupting the sending of information in wireless networks. To ge t better performance for these networks as well in the conventional case as in cooperative one, all the necessary ways must be used to eliminate network interference. This article deals with the concept of Physical Layer Network Coding (PLNC). It is a way to exploit the operation of Network Coding (NC) that occurs naturally in the superimpose d electromagnetic waves (EM). It is a simple physical effect when several EM waves meet in the same physical space, they are mixed together. This mixture of EM waves is a f orm of NC produced by nature. Hence, the situation will be reversed and the interference will be a beneficial way to help the relay when sending information. This paper foc uses on the Symbol Error Rate (SER) Analysis of PLNC in the case of 16QAM modulator. It will exploit in detail the concept of mapping (modulation/demodulation) and will dem onstrate its contribution compared to NC and Traditional Network (TN).", "acknowledgement": "Conclusion In this paper, we took a brief description of different cases of coo perative networks in the case of TWRC. We describe the Traditional cooperative Networks, then the Network Coding, and finally, the Physical Layer Network Coding. This one a llows us to reduce the number of phases from 4 to 2. Furthermore, this paper illustrates that in PLNC and for the case of 16QAM constellation, the SER is lower than in the standard modulation case. 
This is verified with the modulation/demodulation study done and for the In-phase and quadrature case of the modulator.", "authors": [{"name": "R Hajji"}, {"name": "N Hamdi"}], "citations": [{"authors": [{"name": "R Hajji"}, {"name": "N Hamdi"}], "date": "2012", "id": "b0", "index": 0, "issue": null, "journal": "IEEE Electrotechnical Conference (MELECON)", "publisher": null, "title": "Optimizing of Power Allocation for Two-Hop DF Relaying Systems", "url": null, "volume": null}, {"autho rs": [{"name": "J Proakis"}], "date": "1989", "id": "b1", "index": 1, "issue": null, "journal": null, "publisher": null, "title": "Digital Communication", "url": null, "vol ume": null}, {"authors": [{"name": "S Tian"}, {"name": "Li Yonghui"}, {"name": "B Vucetic"}], "date": "2011", "id": "b2", "index": 2, "issue": null, "journal": "IEEE ICC", "publisher": null, "title": "A Near Optimal Amplify and Forward Relaying in Two-Way Relay Networks", "url": null, "volume": null}, {"authors": [{"name": "S Zhang"}, {"name" : "S Liew"}, {"name": "P Lam"}], "date": "2006", "id": "b3", "index": 3, "issue": null, "journal": null, "publisher": null, "title": "Physical Layer Network Coding. ACM Mob iCom", "url": null, "volume": null}], "date": "2013", "doi": null, "journal": {"eissn": null, "issn": null, "issue": "3", "name": "AWERProcedia Information Technology & Com puter Science", "publisher": null, "volume": "03"}, "title": "SER Analysis of Two-Hop Physical Layer Network Coding with 16QAM Modulator, AWERProcedia Information Technolog y & Computer Science"}
+{"abstract": "Suffix trees are by far the most important data structure in stringology, with myriads of applications in fields like bioinformatics and information retrieval. Classical representations of suffix trees require O(n log n) bits of space, for a string of size n. This is consid erably more than the n log 2 5Cu03c3 bits needed for the string itself, where 5Cu03c3 is the alphabet size. The size of suffix trees has been a barrier to their wider a doption in practice. Recent compressed suffix tree representations require just the space of the compressed string plus 5Cu0398(n) extra bits. This is already spectacular , but still unsatisfactory when 5Cu03c3 is small as in DNA sequences. In this paper we introduce the first compressed suffix tree representation that breaks this linear-s pace barrier. Our representation requires sublinear extra space and supports a large set of navigational operations in logarithmic time. An essential ingredient of our repr esentation is the lowest common ancestor (LCA) query. We reveal important connections between LCA queries and suffix tree navigation.", "acknowledgement": null, "authors": [{"name": "Lu5Cu00eds Russo"}, {"name": "Gonzalo Navarro"}, {"name": "Arlindo Oliveira"}], "citations": [{"authors": [{"name": "A Apostolico"}], "date": "1985", "id": "b0 ", "index": 0, "issue": null, "journal": "Combinatorial Algorithms on Words. NATO ISI Series", "publisher": null, "title": "The myriad virtues of subword trees", "url": null, "volume": null}, {"authors": [{"name": "M Bender"}, {"name": "M Farach-Colton"}], "date": "2000", "id": "b1", "index": 1, "issue": null, "journal": "Proceedings of LATIN ", "publisher": null, "title": "The LCA problem revisited", "url": null, "volume": "1776"}, {"authors": [{"name": "M Bender"}, {"name": "M Farach-Colton"}], "date": "2004", "id": "b2", "index": 2, "issue": "1", "journal": "Theor. Comp. 
Sci", "publisher": null, "title": "The level ancestor problem simplified", "url": null, "volume": "321"}, {" authors": [{"name": "M Farach"}], "date": "1997", "id": "b3", "index": 3, "issue": null, "journal": "Proceedings of FOCS", "publisher": null, "title": "Optimal suffix tree construction with large alphabets", "url": null, "volume": null}, {"authors": [{"name": "P Ferragina"}, {"name": "G Manzini"}, {"name": "V M5Cu00e4kinen"}, {"name": "G Na varro"}], "date": "2007", "id": "b4", "index": 4, "issue": "2", "journal": "ACM Trans. Algor", "publisher": null, "title": "Compressed representations of sequences and full -text indexes", "url": null, "volume": "3"}, {"authors": [{"name": "J Fischer"}, {"name": "V Heun"}], "date": "2007", "id": "b5", "index": 5, "issue": null, "journal": "Pro ceedings of ESCAPE", "publisher": null, "title": "A new succinct representation of RMQ-information and improvements in the enhanced suffix array", "url": null, "volume": "4 614"}, {"authors": [{"name": "L Foschini"}, {"name": "R Grossi"}, {"name": "A Gupta"}, {"name": "J Vitter"}], "date": "2006", "id": "b6", "index": 6, "issue": "4", "journal ": "ACM Trans. Algor", "publisher": null, "title": "When indexing equals compression: Experiments with compressing suffix arrays and applications", "url": null, "volume": " 2"}, {"authors": [{"name": "R Geary"}, {"name": "R Raman"}, {"name": "V Raman"}], "date": "2004", "id": "b7", "index": 7, "issue": null, "journal": "Proceedings of SODA", " publisher": null, "title": "Succinct ordinal trees with level-ancestor queries", "url": null, "volume": null}, {"authors": [{"name": "R Giegerich"}, {"name": "S Kurtz"}, {"name": "J Stoye"}], "date": "2003", "id": "b8", "index": 8, "issue": "11", "journal": "Softw., Pract. 
Exper", "publisher": null, "title": "Efficient implementation of lazy suffix trees", "url": null, "volume": "33"}, {"authors": [{"name": "D Gusfield"}], "date": "1997", "id": "b9", "index": 9, "issue": null, "journal": null, "publisher": null , "title": "Algorithms on Strings, Trees and Sequences", "url": null, "volume": null}, {"authors": [{"name": "D Knuth"}, {"name": "J"}, {"name": "V Pratt"}], "date": "1977" , "id": "b10", "index": 10, "issue": "2", "journal": "SIAM J. Comput", "publisher": null, "title": "Fast pattern matching in strings", "url": null, "volume": "6"}, {"author s": [{"name": "S Lee"}, {"name": "K Park"}], "date": "2007", "id": "b11", "index": 11, "issue": null, "journal": "Proceedings of CPM", "publisher": null, "title": "Dynamic rank-select structures with applications to run-length encoded texts", "url": null, "volume": "4580"}, {"authors": [{"name": "V M5Cu00e4kinen"}, {"name": "G Navarro"}], " date": "2006", "id": "b12", "index": 12, "issue": null, "journal": "Proceedings of CPM", "publisher": null, "title": "Dynamic entropy-compressed sequences and full-text ind exes", "url": null, "volume": "4009"}, {"authors": [{"name": "U Manber"}, {"name": "E Myers"}], "date": "1993", "id": "b13", "index": 13, "issue": "5", "journal": "SIAM J. Comput", "publisher": null, "title": "Suffix arrays: A new method for on-line string searches", "url": null, "volume": "22"}, {"authors": [{"name": "G Manzini"}], "date": " 2001", "id": "b14", "index": 14, "issue": "3", "journal": "J. ACM", "publisher": null, "title": "An analysis of the Burrows-Wheeler transform", "url": null, "volume": "48"} , {"authors": [{"name": "E Mccreight"}], "date": "1976", "id": "b15", "index": 15, "issue": "2", "journal": "J. 
ACM", "publisher": null, "title": "A space-economical suffix tree construction algorithm", "url": null, "volume": "32"}, {"authors": [{"name": "G Navarro"}, {"name": "V M5Cu00e4kinen"}], "date": "2007", "id": "b16", "index": 16, " issue": "1", "journal": "ACM Comp. Surv", "publisher": null, "title": "Compressed full-text indexes", "url": null, "volume": "39"}, {"authors": [{"name": "R Raman"}, {"name": "V Raman"}, {"name": "S Rao"}], "date": "2002", "id": "b17", "index": 17, "issue": null, "journal": "Proceedings of SODA", "publisher": null, "title": "Succinct indexabl e dictionaries with applications to encoding k-ary trees and multisets", "url": null, "volume": null}, {"authors": [{"name": "L Russo"}, {"name": "A Oliveira"}], "date": "2 006", "id": "b18", "index": 18, "issue": null, "journal": "Proceedings of SPIRE", "publisher": null, "title": "A compressed self-index using a Ziv-Lempel dictionary", "url" : null, "volume": "4209"}, {"authors": [{"name": "K Sadakane"}], "date": "2003", "id": "b19", "index": 19, "issue": "2", "journal": "J. of Algorithms", "publisher": null, " title": "New text indexing functionalities of the compressed suffix arrays", "url": null, "volume": "48"}, {"authors": [{"name": "K Sadakane"}], "date": "2007", "id": "b20" , "index": 20, "issue": null, "journal": "Theo. Comp. 
Sys", "publisher": null, "title": "Compressed Suffix Trees with Full Functionality", "url": null, "volume": null}, {"a uthors": [{"name": "E Ukkonen"}], "date": "1995", "id": "b21", "index": 21, "issue": "3", "journal": "Algorithmica", "publisher": null, "title": "Construting suffix trees o n-line in linear time", "url": null, "volume": "14"}, {"authors": [{"name": "N V5Cu00e4lim5Cu00e4ki"}, {"name": "W Gerlach"}, {"name": "K Dixit"}, {"name": "V M5Cu00e 4kinen"}], "date": "2007", "id": "b22", "index": 22, "issue": null, "journal": "Proceedings of WEA", "publisher": null, "title": "Engineering a compressed suffix tree imple mentation", "url": null, "volume": "4525"}, {"authors": [{"name": "P Weiner"}], "date": "1973", "id": "b23", "index": 23, "issue": null, "journal": "Proceedings of IEEE Sym p. on Switching and Automata Theory", "publisher": null, "title": "Linear pattern matching algorithms", "url": null, "volume": null}], "date": false, "doi": null, "journal" : {"eissn": null, "issn": null, "issue": null, "name": null, "publisher": null, "volume": null}, "title": "Fully-Compressed Suffix Trees"}
+{"abstract": null, "acknowledgement": null, "authors": [{"name": "Carine Van Huls Van Taxis"}, {"name": "Sebastiaan Piers"}, {"name": "Marta De Riva Silva"}, {"name": "Olaf Dekkers"}, {"name": "Dani5Cu00ebl Pijnappels"}, {"name": "Martin Schalij"}, {"name": "Adrianus Wijnmaalen"}, {"name": "Katja Zeppenfeld"}], "citations": [{"authors": [{"name": "T Baman"}, {"name": "D Lange"}, {"name": "K Ilg"}, {"name": "S Gupta"}, {"name": "T Liu"}, {"name": "C Algui re"}, {"name": "W Armstrong"}, {"name": "E Good"}, {"name": "A Chugh"}, {"name": "K Jongnarangsin"}, {"name": "F Pelosi"}, {"name": "Jr Crawford"}, {"name": "T Ebinger"}, { "name": "M Oral"}, {"name": "H Morady"}, {"name": "F Bogun"}, {"name": "F"}], "date": "2010", "id": "b0", "index": 0, "issue": null, "journal": "Heart Rhythm", "publisher": null, "title": "Relationship between burden of premature ventricular complexes and left ventricular function", "url": null, "volume": "7"}, {"authors": [{"name": "M Yokoka wa"}, {"name": "H Kim"}, {"name": "E Good"}, {"name": "A Chugh"}, {"name": "F Pelosi"}, {"name": "Jr Alguire"}, {"name": "C Armstrong"}, {"name": "W Crawford"}, {"name": "T Jongnarangsin"}, {"name": "K Oral"}, {"name": "H Morady"}, {"name": "F Bogun"}, {"name": "F"}], "date": "2012", "id": "b1", "index": 1, "issue": null, "journal": "Heart Rh ythm", "publisher": null, "title": "Relation of symptoms and symptom duration to premature ventricular complex-induced cardiomyopathy", "url": null, "volume": "9"}, {"autho rs": [{"name": "M Yokokawa"}, {"name": "H Kim"}, {"name": "E Good"}, {"name": "T Crawford"}, {"name": "A Chugh"}, {"name": "F Pelosi"}, {"name": "Jr Jongnarangsin"}, {"name": "K Latchamsetty"}, {"name": "R Armstrong"}, {"name": "W Alguire"}, {"name": "C Oral"}, {"name": "H Morady"}, {"name": "F Bogun"}, {"name": "F"}], "date": "2012", "id": " b2", "index": 2, "issue": null, "journal": "Heart Rhythm", "publisher": null, "title": "Impact of QRS duration of frequent premature ventricular complexes on 
the developmen t of cardiomyopathy", "url": null, "volume": "9"}, {"authors": [{"name": "P Carballeira"}, {"name": "M Deyell"}, {"name": "D Frankel"}, {"name": "D Benhayon"}, {"name": "F Squara"}, {"name": "W Chik"}, {"name": "M Kohari"}, {"name": "R Deo"}, {"name": "F Marchlinski"}], "date": "2014", "id": "b3", "index": 3, "issue": null, "journal": "Heart Rhythm", "publisher": null, "title": "Ventricular premature depolarization QRS duration as a new marker of risk for the development of ventricular premature depolarization- induced cardiomyopathy", "url": null, "volume": "11"}, {"authors": [{"name": "E Aliot"}, {"name": "W Stevenson"}, {"name": "J Almendral-Garrote"}, {"name": "F Bogun"}, {"name": "C Calkins"}, {"name": "E Delacretaz"}, {"name": "B Della"}, {"name": "G Hindricks"}, {"name": "P Jais"}, {"name": "M Josephson"}, {"name": "J Kautzner"}, {"name": "G Kay"}, {"name": "K Kuck"}, {"name": "B Lerman"}, {"name": "F Marchlinski"}, {"name": "V Reddy"}, {"name": "M Schalij"}, {"name": "R Schilling"}, {"name": "K Soejima"}, {"name": "Wilber Ehra/"}], "date": false, "id": "b4", "index": 4, "issue": null, "journal": null, "publisher": null, "title": "HRS Expert Consensus on Catheter Ablation of Ventricular Arrhythmias: developed in a partnership with the European Heart Rhythm Association (EHRA), a Registered Branch of the European Society of Cardiology (ESC), and the Heart Rhythm Society", "url": null, "volume": null}, {"authors": [{"name": "B Ts"}, {"name": "S"}, {"name": "S Ilg"}, {"name": "K Gupta"}, {"name": "S Liu"}, {"name": "T T Ty Y Alguire"}, {"name": "C"}, {"name": "Ar Ar Rms M Tron On Ong"}, {"name": "W Good"}, {"name": "E Chugh"}, {"name": "A A"}, {"name": "A"}, {"name": "J J Jongnaran N Ngs G s Gsin In In K K K"}, {"name": "Pe Pe Pelo Lo Losi Si Si"}, {"name": "F"}, {"name": ","}, {"name": "Jr"}], "date": false, "id": "b5", "index": 5, "issue": null, "journal": null, "publisher": null, "title": "Cra ra rawf wf wfor o o d d d T, T, T, Ebi i in ng ger 
r M M M", "url": null, "volume": null}, {"authors": [{"name": "M"}, {"name": "M"}, {"name": "M Kim M M H H Hm"}, {"name": "M Goo Oo Od"}, {"name": "E Chugh G G A A A, Pe Pe Pelo Lo Losi S"}, {"name": "F Jr R R. ; W"}, {"name": "Crawford T ; Mo Mo Mora Ra Rady Dy Dy F F F"}, {"name": "Bo Bogu Gu Gun N"}, {"name": "F"}], "date": "2012", "id": "b6", "index": 6, "issue": null, "journal": null, "publisher": null, "title": "Rel el elat at atio io ion n n of of of s s sym ym ympt pt ptom om oms s s an an and d d sy sy symp mp mpto to tom m m du du dur r ration o o o p p pre re rema ma matu tu ture re re v v ve e ent nt ntri ri ricu cu cula la ar r r co co comp mp mple le ex x x-i ind nd duc uc uced", "url": null, "volume": "20"}, {"authors": [{"name": "Cardiology Col lege Of"}], "date": "2009", "id": "b7", "index": 7, "issue": null, "journal": null, "publisher": null, "title": "ACC) and the American Heart Association (AHA). Heart Rhythm ", "url": null, "volume": "6"}, {"authors": [{"name": "D Zipes"}, {"name": "A Camm"}, {"name": "M Borggrefe"}, {"name": "A Buxton"}, {"name": "B Chaitman"}, {"name": "M Fro mer"}, {"name": "G Gregoratos"}, {"name": "G Klein"}, {"name": "A Moss"}, {"name": "R Myerburg"}, {"name": "S Priori"}, {"name": "M Quinones"}, {"name": "D Roden"}, {"name" : "M Silka"}, {"name": "C Tracy"}, {"name": "S Smith"}, {"name": "Jr Jacobs"}, {"name": "A Adams"}, {"name": "C Antman"}, {"name": "E Anderson"}, {"name": "J Hunt"}, {"name": "S Halperin"}, {"name": "J Nishimura"}, {"name": "R Ornato"}, {"name": "J Page"}, {"name": "R Riegel"}, {"name": "B Priori"}, {"name": "S Blanc"}, {"name": "J Budaj"}, { "name": "A Camm"}, {"name": "A Dean"}, {"name": "V Deckers"}, {"name": "J Despres"}, {"name": "C Dickstein"}, {"name": "K Lekakis"}, {"name": "J Mcgregor"}, {"name": "K Met ra"}, {"name": "M Morais"}, {"name": "J Osterspey"}, {"name": "A Tamargo"}, {"name": "J Zamorano"}, {"name": "J"}], "date": "2006", "id": "b8", "index": 8, "issue": null, " journal": "J Am 
Coll Cardiol", "publisher": null, "title": "ACC/AHA/ESC 2006 guidelines for management of patients with ventricular arrhythmias and the prevention of sudden cardiac death: a report of the American College of Cardiology/American Heart Association Task Force and the European Society of Cardiology Committee for Practice Guideline s (Writing Committee to Develop Guidelines for Management of Patients With Ventricular Arrhythmias and the Prevention of Sudden Cardiac Death)", "url": null, "volume": "48" }, {"authors": [{"name": "Y Sekiguchi"}, {"name": "K Aonuma"}, {"name": "Y Yamauchi"}, {"name": "T Obayashi"}, {"name": "A Niwa"}, {"name": "H Hachiya"}, {"name": "A Takaha shi"}, {"name": "J Nitta"}, {"name": "Y Iesaka"}, {"name": "M Isobe"}], "date": "2005", "id": "b9", "index": 9, "issue": null, "journal": "J Cardiovasc Electrophysiol", "pu blisher": null, "title": "Chronic hemodynamic effects after radiofrequency catheter ablation of frequent monomorphic ventricular premature beats", "url": null, "volume": "1 6"}, {"authors": [{"name": "H Tada"}, {"name": "S Ito"}, {"name": "G Shinbo"}, {"name": "K Tadokoro"}, {"name": "I Ito"}, {"name": "T Hashimoto"}, {"name": "K Miyaji"}, {"name": "K Kaseno"}, {"name": "S Naito"}, {"name": "A Nogami"}, {"name": "S Oshima"}, {"name": "K Taniguchi"}], "date": "2006", "id": "b10", "index": 10, "issue": null, "jour nal": "Pacing Clin Electrophysiol", "publisher": null, "title": "Significance and utility of plasma brain natriuretic peptide concentrations in patients with idiopathic ven tricular arrhythmias", "url": null, "volume": "29"}, {"authors": [{"name": "F Knebel"}, {"name": "I Schimke"}, {"name": "K Pliet"}, {"name": "S Schattke"}, {"name": "S Mart in"}, {"name": "A Borges"}, {"name": "G Baumann"}], "date": "2005", "id": "b11", "index": 11, "issue": null, "journal": "J Card Fail", "publisher": null, "title": "NT-ProBN P in acute heart failure: correlation with invasively measured hemodynamic parameters during recompensation", 
"url": null, "volume": "11"}, {"authors": [{"name": "R Krittay aphong"}, {"name": "T Boonyasirinant"}, {"name": "P Saiviroonporn"}, {"name": "P Thanapiboonpol"}, {"name": "S Nakyen"}, {"name": "S Udompunturak"}], "date": "2008", "id": "b12", "index": 12, "issue": null, "journal": "J Card Fail", "publisher": null, "title": "Correlation Between NT-pro BNP levels and left ventricular wall stress, sphericity index and extent of myocardial damage: a magnetic resonance imaging study", "url": null, "volume": "14"}, {"authors": [{"name": "S Yuda"}, {"name": "V Khoury"}, {"name": " T Marwick"}], "date": "2002", "id": "b13", "index": 13, "issue": null, "journal": "J Am Coll Cardiol", "publisher": null, "title": "Influence of wall stress and left ventri cular geometry on the accuracy of dobutamine stress echocardiography", "url": null, "volume": "40"}, {"authors": [{"name": "L Krupp"}, {"name": "N Larocca"}, {"name": "J Mu ir-Nash"}, {"name": "A Steinberg"}], "date": "1989", "id": "b14", "index": 14, "issue": null, "journal": "Arch Neurol", "publisher": null, "title": "The fatigue severity sc ale. 
Application to patients with multiple sclerosis and systemic lupus erythematosus", "url": null, "volume": "46"}, {"authors": [{"name": "F Gustafsson"}, {"name": "F Ste ensgaard-Hansen"}, {"name": "J Badskjaer"}, {"name": "A Poulsen"}, {"name": "P Corell"}, {"name": "P Hildebrandt"}], "date": false, "id": "b15", "index": 15, "issue": null, "publisher": null, "title": null, "url": null, "volume": null}, {"authors": [{"name": "Tad Da"}, {"name": "H"}, {"name": ", Ito To"}, {"name": "S"}, {"name": ", Sh S Inbo" }, {"name": "G Tadokoro"}, {"name": "K Ito"}, {"name": "I"}, {"name": "Has Shi Himoto"}, {"name": "T Miyaji"}, {"name": "K"}, {"name": "Ka"}, {"name": "K Naito"}, {"name": "S No O Oga Ga Gam Mi Mi A A A, O Os Oshi Hi Im M Ma"}, {"name": "S Taniguchi"}, {"name": "K"}], "date": false, "id": "b16", "index": 16, "issue": null, "journal": null, "p ublisher": null, "title": "onc nc centrations n n i i in n n pa p p ti ti tien en ents ts ts w w with h h id id idio io iopa pa path th thic ic ic v v ven en entr tr tricul lar r r arr rr rhy hy hyth th thmi m m as as as", "url": null, "volume": null}, {"authors": [{"name": "K K Kne Ne Nebe"}, {"name": "F"}, {"name": "F"}, {"name": "F"}, {"name": "S S Sch Ch Chim Im Mke Ke Ke I"}, {"name": ","}, {"name": "Pl Pl P Iet T K"}, {"name": "K"}, {"name": "K"}, {"name": "S S Sch Ch Chattk Tk Ke E S"}, {"name": "S"}, {"name": "S Ti Tin N S"}, {"name": "S"}, {"name": "S B B"}], "date": false, "id": "b17", "index": 17, "issue": null, "publisher": null, "title": null, "url": null, "volume": null}, {"authors": [{"name": "S Vickery"}, {"name": "C Price"}, {"name": "R John"}, {"name": "N Abbas"}, {"name": "M Webb"}, {"name": "M Kempson"}, {"name": "E Lamb"}], "da te": "2005", "id": "b18", "index": 18, "issue": null, "journal": "Am J Kidney Dis", "publisher": null, "title": "B-type natriuretic peptide (BNP) and amino-terminal proBNP in patients with CKD: relationship to renal function and left ventricular hypertrophy", "url": null, 
"volume": "46"}, {"authors": [{"name": "C Van Huls Van Taxis"}, {"name" : "A Wijnmaalen"}, {"name": "D Den Uijl"}, {"name": "M Gawrysiak"}, {"name": "H Putter"}, {"name": "M Schalij"}, {"name": "K Zeppenfeld"}], "date": "2011", "id": "b19", "in dex": 19, "issue": null, "journal": "Heart Rhythm", "publisher": null, "title": "Reversed polarity of bipolar electrograms for predicting a successful ablation site in foca l idiopathic right ventricular outflow tract arrhythmias", "url": null, "volume": "8"}, {"authors": [{"name": "D Penela"}, {"name": "C Van Huls Van Taxis"}, {"name": "L Agu inaga"}, {"name": "J Fernandez-Armenta"}, {"name": "L Mont"}, {"name": "M Castel"}, {"name": "M Heras"}, {"name": "J Tolosana"}, {"name": "M Sitges"}, {"name": "A Ordonez"} , {"name": "J Brugada"}, {"name": "K Zeppenfeld"}, {"name": "A Berruezo"}], "date": "2013", "id": "b20", "index": 20, "issue": null, "journal": "J Am Coll Cardiol", "publis her": null, "title": "Neurohormonal, structural, and functional recovery pattern after premature ventricular complex ablation is independent of structural heart disease sta tus in patients with depressed left ventricular ejection fraction: a prospective multicenter study", "url": null, "volume": "62"}, {"authors": [{"name": "S Niwano"}, {"name": "Y Wakisaka"}, {"name": "H Niwano"}, {"name": "H Fukaya"}, {"name": "S Kurokawa"}, {"name": "M Kiryu"}, {"name": "Y Hatakeyama"}, {"name": "T Izumi"}], "date": "2009", " id": "b21", "index": 21, "issue": null, "journal": "Heart", "publisher": null, "title": "Prognostic significance of frequent premature ventricular contractions originating from the ventricular outflow tract in patients with normal left ventricular function", "url": null, "volume": "95"}, {"authors": [{"name": "L Costello-Boerrigter"}, {"name" : "G Boerrigter"}, {"name": "M Redfield"}, {"name": "R Rodeheffer"}, {"name": "L Urban"}, {"name": "D Mahoney"}, {"name": "S Jacobsen"}, {"name": "D Heublein"}, {"name": "J Burnett"}], "date": 
"2006", "id": "b22", "index": 22, "issue": null, "journal": "J Am Coll Cardiol", "publisher": null, "title": "Amino-terminal pro-B-type natriuretic pep tide and B-type natriuretic peptide in the general community: determinants and detection of left ventricular dysfunction", "url": null, "volume": "47"}, {"authors": [{"name": "L Co Ostello-Boe Err Rrigter"}, {"name": "G Boerrigter"}, {"name": "Redfield"}, {"name": "M Mm"}, {"name": "R Rodeheffer"}, {"name": ", Ur U Ban"}, {"name": "L Mahoney" }, {"name": "Dw W W"}, {"name": ","}, {"name": "Ja Jacobs Bs Bsen En En"}, {"name": "S S Sj J J Heublein"}, {"name": "D Burnett"}, {"name": "J"}], "date": false, "id": "b23 ", "index": 23, "issue": null, "journal": "mu mu munity ty y: : : de de dete te term rm min in inan", "publisher": null, "title": "Am Am Amino-terminal p p pro r r-B-ty ty type natriuretic peptide an an nd d B-type n na at atri ri riur ur", "url": null, "volume": null}], "date": false, "doi": "10.1161/circep.115.003091", "journal": {"eissn": "1941-3084", "issn": "1941-3149", "issue": null, "name": "Circulation: Arrhythmia and Electrophysiology", "publisher": "Ovid Technologies (Wolters Kluwer Health)", "volume" : null}, "title": "Fatigue as Presenting Symptom and a High Burden of Premature Ventricular Contractions Are Independently Associated with Increased Ventricular Wall Stress in Patients with Normal Left Ventricular Function"}
diff --git a/python_hadoop/tests/files/example_ungrobided.tsv b/python_hadoop/tests/files/example_ungrobided.tsv
new file mode 100644
index 0000000..9263b6f
--- /dev/null
+++ b/python_hadoop/tests/files/example_ungrobided.tsv
@@ -0,0 +1,20 @@
+sha1:23LOSW2QVMKUYXPFZBXQHBBNQR45WTMU {"c": 1, "d": "2017-10-27T22:21:13", "f": "PDFS-20171027214658-00155.warc.gz", "o": 984263791, "u": "http://circ.ahajournals.org/content/circulationaha/53/6/965.full.pdf"} application/pdf {"c_size": 1050532, "dt": "20171027222113", "offset": 984263791, "surt": "org,ahajournals,circ)/content/circulationaha/53/6/965.full.pdf", "url": "http://circ.ahajournals.org/content/circulationaha/53/6/965.full.pdf", "warc": "PDFS-20171027125450-crawl815/PDFS-20171027214658-00155.warc.gz"}
+sha1:23M2N262M5TWB7F3BVB6ESD3Q26SMPFA {"c": 1, "d": "2012-09-29T07:05:16", "f": "ARCHIVEIT-219-QUARTERLY-FWGZDI-20120929065657-00119-crawling203.us.archive.org-6680.warc.gz", "o": 83570746, "u": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=37&id=225&artlang=en"} application/pdf {"c_size": 3590, "dt": "20120929070516", "offset": 83570746, "surt": "edu,indiana)/~orafaq/faq/pdf.php?artlang=en&cat=37&id=225", "url": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=37&id=225&artlang=en", "warc": "ARCHIVEIT-219-QUARTERLY-FWGZDI-00001/ARCHIVEIT-219-QUARTERLY-FWGZDI-20120929065657-00119-crawling203.us.archive.org-6680.warc.gz"}
+sha1:23MFQLDGP4WJD67BS7ERMYQUF7TGCG5X {"c": 1, "d": "2017-08-25T15:19:28", "f": "MSAG-PDF-CRAWL-2017-08-04-20170825143335512-08107-3480~wbgrp-svc284.us.archive.org~8443.warc.gz", "o": 573475485, "u": "http://www.bloodjournal.org/content/bloodjournal/77/7/1484.full.pdf?sso-checked=true"} application/pdf {"c_size": 3411470, "dt": "20170825151928", "offset": 573475485, "surt": "org,bloodjournal)/content/bloodjournal/77/7/1484.full.pdf?sso-checked=true", "url": "http://www.bloodjournal.org/content/bloodjournal/77/7/1484.full.pdf?sso-checked=true", "warc": "MSAG-PDF-CRAWL-2017-08-04-20170825114428485-08102-08111-wbgrp-svc284/MSAG-PDF-CRAWL-2017-08-04-20170825143335512-08107-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"}
+sha1:23MG6K5Z3JENYCZ2OGTNOJ7QPYANJRCZ {"c": 1, "d": "2017-10-10T11:29:50", "f": "PDFS-20171010110639-00278.warc.gz", "o": 665222706, "u": "http://circ.ahajournals.org/content/circulationaha/53/5/797.full.pdf?download=true"} application/pdf {"c_size": 1107121, "dt": "20171010112950", "offset": 665222706, "surt": "org,ahajournals,circ)/content/circulationaha/53/5/797.full.pdf?download=true", "url": "http://circ.ahajournals.org/content/circulationaha/53/5/797.full.pdf?download=true", "warc": "PDFS-20171010110639-crawl815/PDFS-20171010110639-00278.warc.gz"}
+sha1:23MN67JQKDWRUXJMXXJ2GX6O43SQIV76 {"c": 1, "d": "2015-06-06T18:17:08", "f": "ARCHIVEIT-219-QUARTERLY-9582-20150606125000169-00071-wbgrp-crawl067.us.archive.org-6440.warc.gz", "o": 603211220, "u": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=36&id=95&artlang=en"} application/pdf {"c_size": 3450, "dt": "20150606181708", "offset": 603211220, "surt": "edu,indiana)/~orafaq/faq/pdf.php?artlang=en&cat=36&id=95", "url": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=36&id=95&artlang=en", "warc": "ARCHIVEIT-219-QUARTERLY-9582-00007/ARCHIVEIT-219-QUARTERLY-9582-20150606125000169-00071-wbgrp-crawl067.us.archive.org-6440.warc.gz"}
+sha1:23MQCEOMQS5SMZCXIPNQ4E3ZKOCF6DIM {"c": 1, "d": "2004-01-02T19:45:22", "f": "DU_crawl10.20040102194453.arc.gz", "o": 38062592, "u": "http://www.csupomona.edu:80/~darsadmin/ACTIONITEMS.pdf"} application/pdf {"c_size": 15406, "dt": "20040102194522", "offset": 38062592, "surt": "edu,csupomona)/~darsadmin/actionitems.pdf", "url": "http://www.csupomona.edu:80/~darsadmin/ACTIONITEMS.pdf", "warc": "DU_crawl10.20040102181929-c/DU_crawl10.20040102194453.arc.gz"}
+sha1:23NKO4TW6XCESXMSUOOICI3AXVK6Z5BL {"c": 1, "d": "2015-04-27T11:16:33", "f": "eric.ed.gov-inf-20150409-030712-1648j-00064.warc.gz", "o": 3872820483, "u": "http://files.eric.ed.gov/fulltext/ED088632.pdf"} application/pdf {"c_size": 2223528, "dt": "20150427111633", "offset": 3872820483, "surt": "gov,ed,eric,files)/fulltext/ed088632.pdf", "url": "http://files.eric.ed.gov/fulltext/ED088632.pdf", "warc": "archiveteam_archivebot_go_20150427150006/eric.ed.gov-inf-20150409-030712-1648j-00064.warc.gz"}
+sha1:23NW2EPLXDA6UBIJLQMM2DJ2K3GL3WTB {"c": 1, "d": "2014-08-13T09:04:30", "f": "WIDE-20140813084304-09684.warc.gz", "o": 726289594, "u": "http://research.sdccd.edu/docs/Accreditation/2012%20Surveys/Employee%20-%20Briefing/Mesa%20College%202012%20Employee%20Feedback%20Survey%20Briefing.pdf"} application/pdf {"c_size": 3472527, "dt": "20140813090430", "offset": 726289594, "surt": "edu,sdccd,research)/docs/accreditation/2012%20surveys/employee%20-%20briefing/mesa%20college%202012%20employee%20feedback%20survey%20briefing.pdf", "url": "http://research.sdccd.edu/docs/Accreditation/2012%20Surveys/Employee%20-%20Briefing/Mesa%20College%202012%20Employee%20Feedback%20Survey%20Briefing.pdf", "warc": "WIDE-20140813074743-crawl424/WIDE-20140813084304-09684.warc.gz"}
+sha1:23OQICQ4IBVNHJBWJX5ON3QR26KNMQNT {"c": 1, "d": "2010-06-29T00:35:17", "f": "EDG-20100628234135-01241-ia360918.us.archive.org.warc.gz", "o": 572430160, "u": "http://journalism.arizona.edu/news/rockypt6.pdf"} application/pdf {"c_size": 194706, "dt": "20100629003517", "offset": 572430160, "surt": "edu,arizona,journalism)/news/rockypt6.pdf", "url": "http://journalism.arizona.edu/news/rockypt6.pdf", "warc": "EDG-20100628214935-01235-01243-ia360918-20100629023741-00000/EDG-20100628234135-01241-ia360918.us.archive.org.warc.gz"}
+sha1:23OT2AAYPJ3Z5ZOQXVJJTVTKY6QUPICI {"c": 1, "d": "2007-02-25T17:49:01", "f": "38_0_20070225174831_crawl28.arc.gz", "o": 93868066, "u": "http://www.ece.tufts.edu:80/~hopwood/tampa-proceedings.pdf"} application/pdf {"c_size": 162157, "dt": "20070225174901", "offset": 93868066, "surt": "edu,tufts,ece)/~hopwood/tampa-proceedings.pdf", "url": "http://www.ece.tufts.edu:80/~hopwood/tampa-proceedings.pdf", "warc": "38_0_20070225173722_crawl28-c/38_0_20070225174831_crawl28.arc.gz"}
+sha1:23OUFX3ZYMF53HY4RUONR5PKN4HXN4O3 {"c": 1, "d": "2004-05-26T06:45:34", "f": "DW_crawl10.20040526064432.arc.gz", "o": 67593910, "u": "http://207.36.165.114:80/NewOrleans/Papers/1301466.pdf"} application/pdf {"c_size": 306879, "dt": "20040526064534", "offset": 67593910, "surt": "114,165,36,207)/neworleans/papers/1301466.pdf", "url": "http://207.36.165.114:80/NewOrleans/Papers/1301466.pdf", "warc": "DW_crawl10.20040525230808-c/DW_crawl10.20040526064432.arc.gz"}
+sha1:23PA23UIWCBA3CSTDK2JYX7ZIVOHULFG {"c": 1, "d": "2016-02-05T21:48:33", "f": "NLNZ-NZ-CRAWL-005-20160205211003375-02839-6291~wbgrp-crawl007.us.archive.org~8443.warc.gz", "o": 630386943, "u": "http://homepages.engineering.auckland.ac.nz/~smohan/Outreach/Docs/2013/TTU_REU2013.pdf"} application/pdf {"c_size": 2979614, "dt": "20160205214833", "offset": 630386943, "surt": "nz,ac,auckland,engineering,homepages)/~smohan/outreach/docs/2013/ttu_reu2013.pdf", "url": "http://homepages.engineering.auckland.ac.nz/~smohan/Outreach/Docs/2013/TTU_REU2013.pdf", "warc": "NLNZ-NZ-CRAWL-005-20160205211003375-02839-02848-wbgrp-crawl007/NLNZ-NZ-CRAWL-005-20160205211003375-02839-6291~wbgrp-crawl007.us.archive.org~8443.warc.gz"}
+sha1:23PGC74CTD7P6PCF3MZZZJMPYFXRK3OB {"c": 1, "d": "2005-03-17T15:05:51", "f": "EC_binary1_crawl30.20050317150502.arc.gz", "o": 75675778, "u": "http://www.csupomona.edu:80/%7Eengineering/programs/courses/aro/course_outlines/aro_407.pdf"} application/pdf {"c_size": 4842, "dt": "20050317150551", "offset": 75675778, "surt": "edu,csupomona)/~engineering/programs/courses/aro/course_outlines/aro_407.pdf", "url": "http://www.csupomona.edu:80/%7Eengineering/programs/courses/aro/course_outlines/aro_407.pdf", "warc": "EC_binary1_crawl30.20050317135651-c/EC_binary1_crawl30.20050317150502.arc.gz"}
+sha1:23PKJEQWUJAIQQSLP3GCCC5VDXN4RFCX {"c": 1, "d": "2017-10-10T23:50:37", "f": "WIDE-20171010214240-16560.warc.gz", "o": 962106404, "u": "http://www.nbrb.by/bv/articles/8997.pdf"} application/pdf {"c_size": 273375, "dt": "20171010235037", "offset": 962106404, "surt": "by,nbrb)/bv/articles/8997.pdf", "url": "http://www.nbrb.by/bv/articles/8997.pdf", "warc": "WIDE-20171010202419-crawl424/WIDE-20171010214240-16560.warc.gz"}
+sha1:23PRILJUIQUKHRYQIUYAKSBFPH53FOGT {"c": 1, "d": "2017-07-14T18:51:38", "f": "WIDE-20170714181144-06521.warc.gz", "o": 820382225, "u": "http://carsandracingstuff.com/library/articles/32538.pdf"} application/pdf {"c_size": 125426, "dt": "20170714185138", "offset": 820382225, "surt": "com,carsandracingstuff)/library/articles/32538.pdf", "url": "http://carsandracingstuff.com/library/articles/32538.pdf", "warc": "WIDE-20170714174218-crawl426/WIDE-20170714181144-06521.warc.gz"}
+sha1:23PTUXWSNSVE4HS5J7ELDUUG63J2FPCI {"c": 1, "d": "2016-06-09T00:27:36", "f": "WIDE-20160609001810-06993.warc.gz", "o": 287880616, "u": "http://www.case-research.eu/sites/default/files/publications/18092393_E-brief_Dabrowski_Monetary_Policy_final_0.pdf"} application/pdf {"c_size": 68262, "dt": "20160609002736", "offset": 287880616, "surt": "eu,case-research)/sites/default/files/publications/18092393_e-brief_dabrowski_monetary_policy_final_0.pdf", "url": "http://www.case-research.eu/sites/default/files/publications/18092393_E-brief_Dabrowski_Monetary_Policy_final_0.pdf", "warc": "WIDE-20160609000312-crawl427/WIDE-20160609001810-06993.warc.gz"}
+sha1:23PW2APYHNBPIBRIVNQ6TMKUNY53UL3D {"c": 1, "d": "2016-01-07T03:29:03", "f": "MUSEUM-20160107025230-02354.warc.gz", "o": 413484441, "u": "http://www.portlandoregon.gov/fire/article/363695"} application/pdf {"c_size": 44600, "dt": "20160107032903", "offset": 413484441, "surt": "gov,portlandoregon)/fire/article/363695", "url": "http://www.portlandoregon.gov/fire/article/363695", "warc": "MUSEUM-20160107004301-crawl891/MUSEUM-20160107025230-02354.warc.gz"}
+sha1:23RJIHUIOYY5747CR6YYCTMACXDCFYTT {"c": 1, "d": "2014-06-07T18:00:56", "f": "ARCHIVEIT-219-QUARTERLY-20047-20140607125555378-00017-wbgrp-crawl051.us.archive.org-6442.warc.gz", "o": 720590380, "u": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=36&id=264&artlang=en"} application/pdf {"c_size": 3727, "dt": "20140607180056", "offset": 720590380, "surt": "edu,indiana)/~orafaq/faq/pdf.php?artlang=en&cat=36&id=264", "url": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=36&id=264&artlang=en", "warc": "ARCHIVEIT-219-QUARTERLY-20047-00001/ARCHIVEIT-219-QUARTERLY-20047-20140607125555378-00017-wbgrp-crawl051.us.archive.org-6442.warc.gz"}
+sha1:23SMLYPFEGIRV6M37FJ5D364TXQXCSMR {"c": 1, "d": "2011-07-12T22:20:32", "f": "WIDE-20110712221302-03146.warc.gz", "o": 222089710, "u": "http://media.dailyuw.com/papers/_091030_7-14_color_web.pdf"} application/pdf {"c_size": 4654708, "dt": "20110712222032", "offset": 222089710, "surt": "com,dailyuw,media)/papers/_091030_7-14_color_web.pdf", "url": "http://media.dailyuw.com/papers/_091030_7-14_color_web.pdf", "warc": "WIDE-20110712221302-crawl413/WIDE-20110712221302-03146.warc.gz"}
+sha1:23SN4XBPSCRPRIHH5UAV45LFCP3VDV3V {"c": 1, "d": "2010-10-28T09:03:57", "f": "WIDE-20101028084449158-00409-23450~ia360921.us.archive.org~9443.warc.gz", "o": 756726028, "u": "http://cdacnoida.in/ASCNT-2010/Language%20Technology/Paper/Reducing%20Errors%20in%20Translation%20using%20Pre-editor%20for%20Indian%20English%20Sentences.pdf"} application/pdf {"c_size": 98408, "dt": "20101028090357", "offset": 756726028, "surt": "in,cdacnoida)/ascnt-2010/language%20technology/paper/reducing%20errors%20in%20translation%20using%20pre-editor%20for%20indian%20english%20sentences.pdf", "url": "http://cdacnoida.in/ASCNT-2010/Language%20Technology/Paper/Reducing%20Errors%20in%20Translation%20using%20Pre-editor%20for%20Indian%20English%20Sentences.pdf", "warc": "WIDE-20101028063239344-00397-00415-ia360921/WIDE-20101028084449158-00409-23450~ia360921.us.archive.org~9443.warc.gz"}
diff --git a/python_hadoop/tests/files/small.json b/python_hadoop/tests/files/small.json
new file mode 100644
index 0000000..7c75187
--- /dev/null
+++ b/python_hadoop/tests/files/small.json
@@ -0,0 +1,46 @@
+{
+ "title": "Dummy Example File",
+ "authors": [
+ {"name": "Brewster Kahle", "given_name": "Brewster", "surname": "Kahle"},
+ {"name": "J Doe", "given_name": "J", "surname": "Doe"}
+ ],
+ "journal": {
+ "name": "Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678",
+ "eissn": null,
+ "issn": null,
+ "issue": null,
+ "publisher": null,
+ "volume": null
+ },
+ "date": "2000",
+ "doi": null,
+ "citations": [
+ { "authors": [{"name": "A Seaperson", "given_name": "A", "surname": "Seaperson"}],
+ "date": "2001",
+ "id": "b0",
+ "index": 0,
+ "issue": null,
+ "journal": "Letters in the Alphabet",
+ "publisher": null,
+ "title": "Everything is Wonderful",
+ "url": null,
+ "volume": "20"},
+ { "authors": [],
+ "date": "2011-03-28",
+ "id": "b1",
+ "index": 1,
+ "issue": null,
+ "journal": "The Dictionary",
+ "publisher": null,
+ "title": "All about Facts",
+ "url": null,
+ "volume": "14"}
+ ],
+ "abstract": "Everything you ever wanted to know about nothing",
+ "body": "Introduction \nEverything starts somewhere, as somebody [1] once said. \n\n In Depth \n Meat \nYou know, for kids. \n Potatos \nQED.",
+ "acknowledgement": null,
+ "annex": null,
+ "fatcat_release": null,
+ "grobid_timestamp": "2018-04-02T00:31+0000",
+ "grobid_version": "0.5.1-SNAPSHOT"
+}
diff --git a/python_hadoop/tests/files/small.xml b/python_hadoop/tests/files/small.xml
new file mode 100644
index 0000000..78b9ba2
--- /dev/null
+++ b/python_hadoop/tests/files/small.xml
@@ -0,0 +1,110 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<TEI xmlns="http://www.tei-c.org/ns/1.0"
+xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
+xsi:schemaLocation="http://www.tei-c.org/ns/1.0 /srv/grobid/grobid-0.5.1/grobid-home/schemas/xsd/Grobid.xsd"
+ xmlns:xlink="http://www.w3.org/1999/xlink">
+ <teiHeader xml:lang="en">
+ <encodingDesc>
+ <appInfo>
+ <application version="0.5.1-SNAPSHOT" ident="GROBID" when="2018-04-02T00:31+0000">
+ <ref target="https://github.com/kermitt2/grobid">GROBID - A machine learning software for extracting information from scholarly documents</ref>
+ </application>
+ </appInfo>
+ </encodingDesc>
+ <fileDesc>
+ <titleStmt>
+ <title level="a" type="main">Dummy Example File</title>
+ </titleStmt>
+ <publicationStmt>
+ <publisher/>
+ <availability status="unknown"><licence/></availability>
+ <date type="published" when="2000">2000</date>
+ </publicationStmt>
+ <sourceDesc>
+ <biblStruct>
+ <analytic>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">Brewster</forename><surname>Kahle</surname></persName>
+ </author>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="first">J</forename><surname>Doe</surname></persName>
+ </author>
+ <author>
+ <affiliation key="aff0">
+ <orgName type="institution">Internet Archive</orgName>
+ </affiliation>
+ </author>
+ <title level="a" type="main">Dummy Example File</title>
+ </analytic>
+ <monogr>
+ <title level="m">Dummy Example File. Journal of Fake News. pp. 1-2. ISSN 1234-5678</title>
+ <imprint>
+ <date type="published" when="2000">2000</date>
+ </imprint>
+ </monogr>
+ </biblStruct>
+ </sourceDesc>
+ </fileDesc>
+ <profileDesc>
+ <textClass>
+ <keywords>
+ <term>Fake Data</term>
+ </keywords>
+ </textClass>
+ <abstract>
+ <p>Everything you ever wanted to know about nothing</p>
+ </abstract>
+ </profileDesc>
+ </teiHeader>
+ <text xml:lang="en">
+ <body>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="1">Introduction</head><p>
+Everything starts somewhere, as somebody<ref type="bibr" target="#b0">[1]</ref> once said.</p></div>
+
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2">In Depth</head></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.1">Meat</head><p>
+You know, for kids.</p></div>
+<div xmlns="http://www.tei-c.org/ns/1.0"><head n="2.2">Potatos</head><p>
+QED.</p></div>
+ </body>
+ <back>
+ <div type="references">
+
+ <listBibl>
+
+<biblStruct xml:id="b0">
+ <analytic>
+ <title level="a" type="main">Everything is Wonderful</title>
+ <author>
+ <persName xmlns="http://www.tei-c.org/ns/1.0"><forename type="middle">A</forename><surname>Seaperson</surname></persName>
+ </author>
+ </analytic>
+ <monogr>
+ <title level="j">Letters in the Alphabet</title>
+ <imprint>
+ <biblScope unit="volume">20</biblScope>
+ <biblScope unit="page" from="1" to="11" />
+ <date type="published" when="2001" />
+ </imprint>
+ </monogr>
+</biblStruct>
+
+<biblStruct xml:id="b1">
+ <analytic>
+ <title level="a" type="main">All about Facts</title>
+ </analytic>
+ <monogr>
+ <title level="j">The Dictionary</title>
+ <imprint>
+ <biblScope unit="volume">14</biblScope>
+ <date type="published" when="2011-03-28" />
+ </imprint>
+ </monogr>
+ <note>None</note>
+</biblStruct>
+
+ </listBibl>
+ </div>
+ </back>
+ </text>
+</TEI>
diff --git a/python_hadoop/tests/test_backfill_hbase_from_cdx.py b/python_hadoop/tests/test_backfill_hbase_from_cdx.py
new file mode 100644
index 0000000..070662b
--- /dev/null
+++ b/python_hadoop/tests/test_backfill_hbase_from_cdx.py
@@ -0,0 +1,74 @@
+"""
+TODO: could probably refactor to use unittest.mock.patch('happybase')
+"""
+
+import io
+import json
+import pytest
+import mrjob
+import happybase_mock
+from backfill_hbase_from_cdx import MRCDXBackfillHBase
+
+@pytest.fixture
+def job():
+ """Return an MRCDXBackfillHBase job whose hb_table is a happybase_mock table.
+ Note: this mock only seems to work with job.run_mapper(), not job.run();
+ the latter results in a separate instantiation without the mock?
+ """
+ job = MRCDXBackfillHBase(['--no-conf', '-'])
+
+ conn = happybase_mock.Connection()
+ conn.create_table('wbgrp-journal-extract-test',
+ {'file': {}, 'grobid0': {}, 'f': {}})
+ job.hb_table = conn.table('wbgrp-journal-extract-test')
+
+ return job
+
+
+def test_some_lines(job):
+ """Run the mapper over three CDX lines; only the HTTP 200 PDF line should produce a row."""
+ raw = io.BytesIO(b"""
+com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+eu,eui,cadmus)/bitstream/handle/1814/36635/rscas_2015_03.pdf;jsessionid=761393014319a39f40d32ae3eb3a853f?sequence=1 20170705062202 http://cadmus.eui.eu/bitstream/handle/1814/36635/RSCAS_2015_03.pdf%3Bjsessionid%3D761393014319A39F40D32AE3EB3A853F?sequence%3D1 application/PDF 200 MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J - - 854156 328850624 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+com,pbworks,educ333b)/robots.txt 20170705063311 http://educ333b.pbworks.com/robots.txt text/plain 200 6VAUYENMOU2SK2OWNRPDD6WTQTECGZAD - - 638 398190140 CITESEERX-CRAWL-2017-06-20-20170705062707827-00049-00058-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705063158203-00053-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+""")
+
+ job.sandbox(stdin=raw)
+ job.run_mapper()
+
+ assert job.hb_table.row(b'1') == {}
+ # HTTP 301
+ assert job.hb_table.row(b'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ') == {}
+ # valid
+ assert job.hb_table.row(b'sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J') != {}
+ # text/plain
+ assert job.hb_table.row(b'sha1:6VAUYENMOU2SK2OWNRPDD6WTQTECGZAD') == {}
+
+ row = job.hb_table.row(b'sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J')
+ assert row[b'file:mime'] == b"application/pdf"
+
+ file_cdx = json.loads(row[b'file:cdx'].decode('utf-8'))
+ assert int(file_cdx['offset']) == 328850624
+
+ f_c = json.loads(row[b'f:c'].decode('utf-8'))
+ assert f_c['u'] == "http://cadmus.eui.eu/bitstream/handle/1814/36635/RSCAS_2015_03.pdf%3Bjsessionid%3D761393014319A39F40D32AE3EB3A853F?sequence%3D1"
+ assert 'i' not in f_c  # json.loads() yields str keys; a bytes key could never match
+
+def test_parse_cdx_skip(job):
+ """Check the mapper yields (None, status) for an invalid line and for a non-PDF mimetype."""
+ job.mapper_init()
+
+ print("CDX prefix")
+ raw = " com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ info, status = next(job.mapper(None, raw))
+ assert info is None
+ assert status['status'] == "invalid"
+ assert 'prefix' in status['reason']
+
+ print("mimetype")
+ raw = "com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf text/html 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
+ info, status = next(job.mapper(None, raw))
+ assert info is None
+ assert status['status'] == "skip"
+ assert 'mimetype' in status['reason']
+
diff --git a/python_hadoop/tests/test_common.py b/python_hadoop/tests/test_common.py
new file mode 100644
index 0000000..34d50ed
--- /dev/null
+++ b/python_hadoop/tests/test_common.py
@@ -0,0 +1,40 @@
+
+from common import *
+
+
+def test_parse_cdx_line():
+    """A well-formed CDX line parses into the expected key/mime/cdx/f:c structure.
+
+    The same result is expected when the line carries a trailing newline or
+    an extra trailing field.
+    """
+
+    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+
+    url = "https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf"
+    expected_cdx = {
+        'surt': "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf",
+        'url': url,
+        'dt': "20170828233154",
+        'warc': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+        'offset': 931661233,
+        'c_size': 210251,
+    }
+    expected_fc = {
+        'u': url,
+        'd': "2017-08-28T23:31:54",
+        'f': "SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz",
+        'o': 931661233,
+        'c': 1,
+    }
+    expected = {
+        'key': "sha1:WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G",
+        'file:mime': "application/pdf",
+        'file:cdx': expected_cdx,
+        'f:c': expected_fc,
+    }
+
+    # bare line, trailing newline, and an extra trailing column
+    for suffix in ("", "\n", " extra_field"):
+        assert parse_cdx_line(raw + suffix) == expected
+
+def test_invalid_cdx():
+    """Malformed CDX lines must make parse_cdx_line() return None."""
+
+    print("missing warc")
+    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 20170828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233 -"
+    # None is a singleton: compare by identity, not equality
+    assert parse_cdx_line(raw) is None
+
+    print("bad datetime")
+    # NOTE(review): besides the 13-digit datetime this line also carries a
+    # non-numeric offset ("931661233i"), so the case does not isolate the
+    # datetime check -- confirm which field is meant to trigger the rejection.
+    raw = "edu,upenn,ldc)/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf 2070828233154 https://www.ldc.upenn.edu/sites/www.ldc.upenn.edu/files/medar2009-large-arabic-broadcast-collection.pdf application/pdf 200 WL3FEA62TEU4F52Y5DOVQ62VET4QJW7G - - 210251 931661233i SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828231135742-00000-00009-wbgrp-svc284/SEMSCHOLAR-PDF-CRAWL-2017-08-04-20170828232253025-00005-3480~wbgrp-svc284.us.archive.org~8443.warc.gz"
+    assert parse_cdx_line(raw) is None
diff --git a/python_hadoop/tests/test_extraction_cdx_grobid.py b/python_hadoop/tests/test_extraction_cdx_grobid.py
new file mode 100644
index 0000000..471d94a
--- /dev/null
+++ b/python_hadoop/tests/test_extraction_cdx_grobid.py
@@ -0,0 +1,319 @@
+
+import io
+import json
+import mrjob
+import pytest
+import struct
+import responses
+import happybase_mock
+import wayback.exception
+from unittest import mock
+from extraction_cdx_grobid import MRExtractCdxGrobid
+
+
+# Stand-in PDF payload: a "%PDF" marker plus filler, followed by one
+# network-byte-order signed 64-bit integer.
+FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
+# One CDX line (mimetype application/pdf, HTTP status 200) whose SHA-1 column
+# is ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ; the tests below feed it to the mapper
+# and look up the row "sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ".
+OK_CDX_LINE = b"""com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"""
+
+# Sample TEI-XML document used as the mocked GROBID response body. The tests
+# compare it against UTF-8-decoded table cells, so read it as UTF-8 explicitly
+# instead of relying on the locale's default encoding.
+with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'r', encoding='utf-8') as f:
+    REAL_TEI_XML = f.read()
+
+@pytest.fixture
+def job():
+    """
+    Provide an MRExtractCdxGrobid job whose HBase table is a happybase_mock table.
+
+    Note: this mock only seems to work with job.run_mapper(), not job.run();
+    the latter results in a separate instantiation without the mock?
+    """
+    mr_job = MRExtractCdxGrobid(['--no-conf', '-'])
+
+    table_name = 'wbgrp-journal-extract-test'
+    column_families = {'file': {}, 'grobid0': {}, 'f': {}}
+    hbase = happybase_mock.Connection()
+    hbase.create_table(table_name, column_families)
+
+    # swap in the in-memory table in place of a real HBase connection
+    mr_job.hb_table = hbase.table(table_name)
+    return mr_job
+
+
+@mock.patch('extraction_cdx_grobid.MRExtractCdxGrobid.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+@responses.activate
+def test_mapper_lines(mock_fetch, job):
+    """Run the mapper over three CDX lines; only the valid PDF line is extracted.
+
+    The WARC fetch is patched to return FAKE_PDF_BYTES and the GROBID endpoint
+    is mocked to answer with REAL_TEI_XML.
+    """
+
+    responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=200,
+        body=REAL_TEI_XML, content_type='text/xml')
+
+    raw = io.BytesIO(b"""
+com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 301 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+eu,eui,cadmus)/bitstream/handle/1814/36635/rscas_2015_03.pdf;jsessionid=761393014319a39f40d32ae3eb3a853f?sequence=1 20170705062202 http://cadmus.eui.eu/bitstream/handle/1814/36635/RSCAS_2015_03.pdf%3Bjsessionid%3D761393014319A39F40D32AE3EB3A853F?sequence%3D1 application/PDF 200 MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J - - 854156 328850624 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+com,pbworks,educ333b)/robots.txt 20170705063311 http://educ333b.pbworks.com/robots.txt text/plain 200 6VAUYENMOU2SK2OWNRPDD6WTQTECGZAD - - 638 398190140 CITESEERX-CRAWL-2017-06-20-20170705062707827-00049-00058-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705063158203-00053-31209~wbgrp-svc284.us.archive.org~8443.warc.gz
+""")
+
+    output = io.BytesIO()
+    job.sandbox(stdin=raw, stdout=output)
+
+    job.run_mapper()
+
+    # for debugging tests
+    #print(output.getvalue().decode('utf-8'))
+    #print(list(job.hb_table.scan()))
+
+    # wayback is fetched exactly once (only for the valid PDF line)
+    mock_fetch.assert_called_once_with(
+        "CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz",
+        328850624,
+        854156)
+
+    # GROBID receives exactly one POST
+    assert len(responses.calls) == 1
+
+    # HBase
+    assert job.hb_table.row(b'1') == {}
+    # HTTP 301
+    assert job.hb_table.row(b'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ') == {}
+    # valid
+    assert job.hb_table.row(b'sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J') != {}
+    # text/plain
+    assert job.hb_table.row(b'sha1:6VAUYENMOU2SK2OWNRPDD6WTQTECGZAD') == {}
+
+    # Saved extraction info
+    row = job.hb_table.row(b'sha1:MPCXVWMUTRUGFP36SLPHKDLY6NGU4S3J')
+
+    assert struct.unpack("!q", row[b'file:size'])[0] == len(FAKE_PDF_BYTES)
+    assert row[b'file:mime'] == b"application/pdf"
+    assert struct.unpack("!q", row[b'grobid0:status_code'])[0] == 200
+    # TODO: assert row[b'grobid0:quality'] == None
+    status = json.loads(row[b'grobid0:status'].decode('utf-8'))
+    # isinstance() is the idiomatic type check
+    assert isinstance(status, dict)
+    assert row[b'grobid0:tei_xml'].decode('utf-8') == REAL_TEI_XML
+    tei_json = json.loads(row[b'grobid0:tei_json'].decode('utf-8'))
+    metadata = json.loads(row[b'grobid0:metadata'].decode('utf-8'))
+    assert tei_json['title'] == metadata['title']
+    assert 'body' in tei_json
+    assert 'body' not in metadata
+
+def test_parse_cdx_invalid(job):
+    """job.parse_line() accepts a good CDX line and flags malformed ones as invalid."""
+
+    def parse_rejected(line):
+        """Parse `line`, require that no info came back, and return the status."""
+        parsed, outcome = job.parse_line(line)
+        assert parsed is None
+        return outcome
+
+    print("valid")
+    good_line = "com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
+    _, outcome = job.parse_line(good_line)
+    assert outcome is None
+
+    print("space-prefixed line")
+    outcome = parse_rejected(" com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    assert outcome['status'] == "invalid"
+    assert 'prefix' in outcome['reason']
+
+    print("commented line")
+    outcome = parse_rejected("#com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    assert outcome['status'] == "invalid"
+    assert 'prefix' in outcome['reason']
+
+    print("wrong column count")
+    outcome = parse_rejected("a b c d")
+    assert outcome['status'] == "invalid"
+    assert 'parse' in outcome['reason']
+
+    print("missing mimetype")
+    outcome = parse_rejected("com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf - 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    print(outcome)
+    assert outcome['status'] == "invalid"
+    assert 'parse' in outcome['reason']
+
+    print("HTTP status")
+    outcome = parse_rejected("com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 501 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    assert outcome['status'] == "invalid"
+
+    print("datetime")
+    # NOTE(review): this line has a short datetime AND HTTP status 501, so it
+    # may be rejected for the status alone -- confirm the datetime is covered.
+    outcome = parse_rejected("com,sagepub,cep)/content/28/9/960.full.pdf 20170705 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 501 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    assert outcome['status'] == "invalid"
+
+
+def test_parse_cdx_skip(job):
+    """Lines that parse but should not be processed are reported with status "skip".
+
+    Each case feeds one raw line to ``job.mapper`` and inspects the first
+    ``(info, status)`` pair it yields.
+    """
+
+    job.mapper_init()
+
+    print("warc format")
+    raw = "com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
+    # next() is the idiomatic way to pull the first item out of an iterator
+    info, status = next(job.mapper(None, raw))
+    assert info is None
+    assert status['status'] == "skip"
+    assert 'WARC' in status['reason']
+
+    print("mimetype")
+    raw = "com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf text/html 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz"
+    info, status = next(job.mapper(None, raw))
+    assert info is None
+    assert status['status'] == "skip"
+    assert 'mimetype' in status['reason']
+
+
+@mock.patch('extraction_cdx_grobid.MRExtractCdxGrobid.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+@responses.activate
+def test_grobid_503(mock_fetch, job):
+ """Run the mapper on OK_CDX_LINE while the mocked GROBID endpoint answers 503."""
+
+ # Body returned by the mocked GROBID endpoint alongside the 503 status.
+ status = b'{"status": "done broke due to 503"}'
+ responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=503,
+ body=status)
+
+ output = io.BytesIO()
+ job.sandbox(stdin=io.BytesIO(OK_CDX_LINE), stdout=output)
+ job.run_mapper()
+ row = job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ')
+ # NOTE(review): `status` is rebound here to the decoded table cell, so the
+ # assertion below compares the cell with itself and can only fail if the
+ # row/column is missing or is not valid JSON. If the intent was to compare
+ # against the mocked response body, keep that body under a separate name.
+ status = json.loads(row[b'grobid0:status'].decode('utf-8'))
+ assert json.loads(row[b'grobid0:status'].decode('utf-8')) == status
+
+
+@mock.patch('extraction_cdx_grobid.MRExtractCdxGrobid.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+@responses.activate
+def test_grobid_not_xml(mock_fetch, job):
+    """A 200 GROBID reply that is not XML is saved raw, without TEI-JSON, and reported."""
+
+    junk_body = b'this is not XML'
+    responses.add(
+        responses.POST,
+        'http://localhost:8070/api/processFulltextDocument',
+        status=200,
+        body=junk_body,
+    )
+
+    captured = io.BytesIO()
+    job.sandbox(stdin=io.BytesIO(OK_CDX_LINE), stdout=captured)
+    job.run_mapper()
+    mapper_stdout = captured.getvalue().decode('utf-8')
+
+    saved = job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ')
+    (status_code,) = struct.unpack("!q", saved[b'grobid0:status_code'])
+    assert status_code == 200
+    assert saved[b'grobid0:tei_xml'] == junk_body
+    assert b'grobid0:tei_json' not in saved
+    assert "XML parse error" in mapper_stdout
+
+
+@mock.patch('extraction_cdx_grobid.MRExtractCdxGrobid.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+@responses.activate
+def test_grobid_not_tei(mock_fetch, job):
+    """A 200 GROBID reply with XML that is not TEI is saved raw, without TEI-JSON, and reported."""
+
+    non_tei_body = b'<xml></xml>'
+    responses.add(
+        responses.POST,
+        'http://localhost:8070/api/processFulltextDocument',
+        status=200,
+        body=non_tei_body,
+    )
+
+    captured = io.BytesIO()
+    job.sandbox(stdin=io.BytesIO(OK_CDX_LINE), stdout=captured)
+    job.run_mapper()
+    mapper_stdout = captured.getvalue().decode('utf-8')
+
+    saved = job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ')
+    (status_code,) = struct.unpack("!q", saved[b'grobid0:status_code'])
+    assert status_code == 200
+    assert saved[b'grobid0:tei_xml'] == non_tei_body
+    assert b'grobid0:tei_json' not in saved
+    assert "non-TEI content" in mapper_stdout
+
+
+@mock.patch('extraction_cdx_grobid.MRExtractCdxGrobid.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+def test_grobid_invalid_connection(mock_fetch, job):
+    """With an unresolvable GROBID host the mapper reports an error and writes no row."""
+
+    # ".invalid" host: the GROBID request is expected to fail
+    job.options.grobid_uri = 'http://host.invalid:8070/api/processFulltextDocument'
+
+    output = io.BytesIO()
+    job.sandbox(stdin=io.BytesIO(OK_CDX_LINE), stdout=output)
+    job.run_mapper()
+    output = output.getvalue().decode('utf-8')
+    assert 'error' in output
+    assert 'GROBID' in output
+    assert job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ') == {}
+
+
+def test_wayback_failure(job):
+    """With an unresolvable WARC URI prefix the mapper reports a wayback error and writes no row."""
+
+    job.options.warc_uri_prefix = 'http://host.invalid/'
+
+    captured = io.BytesIO()
+    job.sandbox(stdin=io.BytesIO(OK_CDX_LINE), stdout=captured)
+    job.run_mapper()
+
+    mapper_stdout = captured.getvalue().decode('utf-8')
+    for needle in ('error', 'wayback'):
+        assert needle in mapper_stdout
+    assert job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ') == {}
+
+
+@mock.patch('extraction_cdx_grobid.ResourceStore')
+def test_wayback_not_found(mock_rs, job):
+    """A wayback resource answering 404 yields a "not 200" error and no stored row."""
+
+    # This is... a little convoluted. Basically creating a 404 situation for
+    # reading a wayback resource: ResourceStore() -> store object whose
+    # load_resource() -> resource whose get_status() -> (404, "Not Found").
+    not_found_resource = mock.MagicMock()
+    not_found_resource.get_status.return_value = (404, "Not Found")
+    store_instance = mock.MagicMock(**{'load_resource.return_value': not_found_resource})
+    mock_rs.return_value = store_instance
+    print(mock_rs().load_resource().get_status())
+
+    job.options.warc_uri_prefix = 'http://dummy-archive.org/'
+
+    captured = io.BytesIO()
+    job.sandbox(stdin=io.BytesIO(OK_CDX_LINE), stdout=captured)
+    job.run_mapper()
+    mapper_stdout = captured.getvalue().decode('utf-8')
+
+    print(mapper_stdout)
+    assert 'error' in mapper_stdout
+    assert 'not 200' in mapper_stdout
+    assert job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ') == {}
+
+
+@mock.patch('extraction_cdx_grobid.MRExtractCdxGrobid.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+@responses.activate
+def test_mapper_rerun(mock_fetch, job):
+    """Running the mapper twice on the same line must not fetch or extract again."""
+
+    responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=200,
+        body=REAL_TEI_XML, content_type='text/xml')
+
+    def run_once():
+        """Feed OK_CDX_LINE through the mapper and return its decoded stdout."""
+        sink = io.BytesIO()
+        job.sandbox(stdin=io.BytesIO(OK_CDX_LINE), stdout=sink)
+        job.run_mapper()
+        return sink.getvalue().decode('utf-8')
+
+    first_pass = run_once()
+
+    # one wayback fetch so far
+    assert mock_fetch.call_count == 1
+    # one GROBID POST so far
+    assert len(responses.calls) == 1
+    # HBase
+    assert job.hb_table.row(b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ') != {}
+    assert 'success' in first_pass
+
+    # Run again, same line
+    second_pass = run_once()
+
+    # still only one wayback fetch
+    assert mock_fetch.call_count == 1
+    # still only one GROBID POST
+    assert len(responses.calls) == 1
+    assert 'existing' in second_pass
+
+@mock.patch('extraction_cdx_grobid.MRExtractCdxGrobid.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+@responses.activate
+def test_mapper_previously_backfilled(mock_fetch, job):
+    """A row that already holds only f:c / file:* cells is still fetched and extracted."""
+
+    responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=200,
+        body=REAL_TEI_XML, content_type='text/xml')
+
+    row_key = b'sha1:ABCDEF12345Q2MSVX7XZKYAYSCX5QBYJ'
+    preexisting_cells = {b'f:c': b'{"some": "dict"}', b'file:col': b'bogus'}
+    job.hb_table.put(row_key, preexisting_cells)
+    assert job.hb_table.row(row_key) != {}
+
+    captured = io.BytesIO()
+    job.sandbox(stdin=io.BytesIO(OK_CDX_LINE), stdout=captured)
+    job.run_mapper()
+    mapper_stdout = captured.getvalue().decode('utf-8')
+
+    # exactly one wayback fetch
+    assert mock_fetch.call_count == 1
+    # exactly one GROBID POST
+    assert len(responses.calls) == 1
+    assert 'success' in mapper_stdout
diff --git a/python_hadoop/tests/test_extraction_ungrobided.py b/python_hadoop/tests/test_extraction_ungrobided.py
new file mode 100644
index 0000000..cb46d29
--- /dev/null
+++ b/python_hadoop/tests/test_extraction_ungrobided.py
@@ -0,0 +1,178 @@
+
+import io
+import json
+import mrjob
+import pytest
+import struct
+import responses
+import happybase_mock
+import wayback.exception
+from unittest import mock
+from common import parse_ungrobided_line
+from extraction_ungrobided import MRExtractUnGrobided
+
+
+# Stand-in PDF payload: a "%PDF" marker plus filler, followed by one
+# network-byte-order signed 64-bit integer.
+FAKE_PDF_BYTES = b"%PDF SOME JUNK" + struct.pack("!q", 112853843)
+# One tab-separated "ungrobided" input line with four columns: row key,
+# a JSON object with short keys (c, d, f, o, u), the mimetype, and a JSON
+# object with CDX details (c_size, dt, offset, surt, url, warc).
+OK_UNGROBIDED_LINE = b"\t".join((
+ b"sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ",
+ b"""{"c": 1, "d": "2017-07-06T07:54:11", "f": "CITESEERX-CRAWL-2017-06-20-20170706075012840-00388-3671~wbgrp-svc285.us.archive.org~8443.warc.gz", "o": 914718776, "u": "http://www.ibc7.org/article/file_down.php?mode%3Darticle_print%26pid%3D250"}""",
+ b"application/pdf",
+ b"""{"c_size": 501, "dt": "20170706075411", "offset": 914718776, "surt": "org,ibc7)/article/file_down.php?mode=article_print&pid=250", "url": "http://www.ibc7.org/article/file_down.php?mode%3Darticle_print%26pid%3D250", "warc": "CITESEERX-CRAWL-2017-06-20-20170706074206206-00379-00388-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706075012840-00388-3671~wbgrp-svc285.us.archive.org~8443.warc.gz"}""",
+))
+
+# Sample TEI-XML document used as the mocked GROBID response body. The tests
+# compare it against UTF-8-decoded table cells, so read it as UTF-8 explicitly
+# instead of relying on the locale's default encoding.
+with open('tests/files/23b29ea36382680716be08fc71aa81bd226e8a85.xml', 'r', encoding='utf-8') as f:
+    REAL_TEI_XML = f.read()
+
+@pytest.fixture
+def job():
+    """
+    Provide an MRExtractUnGrobided job whose HBase table is a happybase_mock table.
+
+    Note: this mock only seems to work with job.run_mapper(), not job.run();
+    the latter results in a separate instantiation without the mock?
+    """
+    mr_job = MRExtractUnGrobided(['--no-conf', '-'])
+
+    table_name = 'wbgrp-journal-extract-test'
+    column_families = {'file': {}, 'grobid0': {}, 'f': {}}
+    hbase = happybase_mock.Connection()
+    hbase.create_table(table_name, column_families)
+
+    # swap in the in-memory table in place of a real HBase connection
+    mr_job.hb_table = hbase.table(table_name)
+    return mr_job
+
+
+@mock.patch('extraction_ungrobided.MRExtractUnGrobided.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+@responses.activate
+def test_mapper_single_line(mock_fetch, job):
+    """Run the mapper over OK_UNGROBIDED_LINE and check the extraction saved to HBase.
+
+    The WARC fetch is patched to return FAKE_PDF_BYTES and the GROBID endpoint
+    is mocked to answer with REAL_TEI_XML.
+    """
+
+    responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=200,
+        body=REAL_TEI_XML, content_type='text/xml')
+
+    raw = io.BytesIO(OK_UNGROBIDED_LINE)
+
+    output = io.BytesIO()
+    job.sandbox(stdin=raw, stdout=output)
+
+    job.run_mapper()
+
+    # for debugging tests
+    #print(output.getvalue().decode('utf-8'))
+    #print(list(job.hb_table.scan()))
+
+    # wayback is fetched exactly once, with the warc/offset/c_size from the line
+    mock_fetch.assert_called_once_with(
+        "CITESEERX-CRAWL-2017-06-20-20170706074206206-00379-00388-wbgrp-svc285/CITESEERX-CRAWL-2017-06-20-20170706075012840-00388-3671~wbgrp-svc285.us.archive.org~8443.warc.gz",
+        914718776,
+        501)
+
+    # GROBID receives exactly one POST
+    assert len(responses.calls) == 1
+
+    # HBase
+    assert job.hb_table.row(b'1') == {}
+
+    # Saved extraction info
+    row = job.hb_table.row(b'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ')
+
+    assert struct.unpack("!q", row[b'file:size'])[0] == len(FAKE_PDF_BYTES)
+    # file:mime should actually not get clobbered by GROBID updater
+    #assert row[b'file:mime'] == b"application/pdf"
+    assert struct.unpack("!q", row[b'grobid0:status_code'])[0] == 200
+    # TODO: assert row[b'grobid0:quality'] == None
+    status = json.loads(row[b'grobid0:status'].decode('utf-8'))
+    # isinstance() is the idiomatic type check
+    assert isinstance(status, dict)
+    assert row[b'grobid0:tei_xml'].decode('utf-8') == REAL_TEI_XML
+    tei_json = json.loads(row[b'grobid0:tei_json'].decode('utf-8'))
+    metadata = json.loads(row[b'grobid0:metadata'].decode('utf-8'))
+    assert tei_json['title'] == metadata['title']
+    assert 'body' in tei_json
+    assert 'body' not in metadata
+
+@mock.patch('extraction_ungrobided.MRExtractUnGrobided.fetch_warc_content', return_value=(FAKE_PDF_BYTES, None))
+@responses.activate
+def test_mapper_lines(mock_fetch, job):
+    """Run the mapper over three ungrobided lines; each one is fetched, posted and saved.
+
+    The WARC fetch is patched to return FAKE_PDF_BYTES and the GROBID endpoint
+    is mocked to answer with REAL_TEI_XML.
+    """
+
+    responses.add(responses.POST, 'http://localhost:8070/api/processFulltextDocument', status=200,
+        body=REAL_TEI_XML, content_type='text/xml')
+
+    raw = io.BytesIO(b"""sha1:23PTUXWSNSVE4HS5J7ELDUUG63J2FPCI\t{"c": 1, "d": "2016-06-09T00:27:36", "f": "WIDE-20160609001810-06993.warc.gz", "o": 287880616, "u": "http://www.case-research.eu/sites/default/files/publications/18092393_E-brief_Dabrowski_Monetary_Policy_final_0.pdf"}\tapplication/pdf\t{"c_size": 68262, "dt": "20160609002736", "offset": 287880616, "surt": "eu,case-research)/sites/default/files/publications/18092393_e-brief_dabrowski_monetary_policy_final_0.pdf", "url": "http://www.case-research.eu/sites/default/files/publications/18092393_E-brief_Dabrowski_Monetary_Policy_final_0.pdf", "warc": "WIDE-20160609000312-crawl427/WIDE-20160609001810-06993.warc.gz"}
+sha1:23PW2APYHNBPIBRIVNQ6TMKUNY53UL3D\t{"c": 1, "d": "2016-01-07T03:29:03", "f": "MUSEUM-20160107025230-02354.warc.gz", "o": 413484441, "u": "http://www.portlandoregon.gov/fire/article/363695"}\tapplication/pdf\t{"c_size": 44600, "dt": "20160107032903", "offset": 413484441, "surt": "gov,portlandoregon)/fire/article/363695", "url": "http://www.portlandoregon.gov/fire/article/363695", "warc": "MUSEUM-20160107004301-crawl891/MUSEUM-20160107025230-02354.warc.gz"}
+sha1:23RJIHUIOYY5747CR6YYCTMACXDCFYTT\t{"c": 1, "d": "2014-06-07T18:00:56", "f": "ARCHIVEIT-219-QUARTERLY-20047-20140607125555378-00017-wbgrp-crawl051.us.archive.org-6442.warc.gz", "o": 720590380, "u": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=36&id=264&artlang=en"}\tapplication/pdf\t{"c_size": 3727, "dt": "20140607180056", "offset": 720590380, "surt": "edu,indiana)/~orafaq/faq/pdf.php?artlang=en&cat=36&id=264", "url": "https://www.indiana.edu/~orafaq/faq/pdf.php?cat=36&id=264&artlang=en", "warc": "ARCHIVEIT-219-QUARTERLY-20047-00001/ARCHIVEIT-219-QUARTERLY-20047-20140607125555378-00017-wbgrp-crawl051.us.archive.org-6442.warc.gz"}""")
+
+
+    output = io.BytesIO()
+    job.sandbox(stdin=raw, stdout=output)
+
+    job.run_mapper()
+
+    # for debugging tests
+    #print(output.getvalue().decode('utf-8'))
+    #print(list(job.hb_table.scan()))
+
+    # GROBID receives three POSTs, one per input line
+    assert len(responses.calls) == 3
+
+    # wayback is fetched for each line, in input order
+    mock_fetch.assert_has_calls((
+        mock.call("WIDE-20160609000312-crawl427/WIDE-20160609001810-06993.warc.gz", 287880616, 68262),
+        mock.call("MUSEUM-20160107004301-crawl891/MUSEUM-20160107025230-02354.warc.gz", 413484441, 44600),
+        mock.call("ARCHIVEIT-219-QUARTERLY-20047-00001/ARCHIVEIT-219-QUARTERLY-20047-20140607125555378-00017-wbgrp-crawl051.us.archive.org-6442.warc.gz", 720590380, 3727),
+    ))
+
+    # Saved extraction info
+    assert job.hb_table.row(b'sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ') == {}
+    assert job.hb_table.row(b'sha1:23PTUXWSNSVE4HS5J7ELDUUG63J2FPCI') != {}
+    assert job.hb_table.row(b'sha1:23PW2APYHNBPIBRIVNQ6TMKUNY53UL3D') != {}
+    assert job.hb_table.row(b'sha1:23RJIHUIOYY5747CR6YYCTMACXDCFYTT') != {}
+
+    row = job.hb_table.row(b'sha1:23RJIHUIOYY5747CR6YYCTMACXDCFYTT')
+    assert struct.unpack("!q", row[b'file:size'])[0] == len(FAKE_PDF_BYTES)
+    # file:mime should actually not get clobbered by GROBID updater
+    #assert row[b'file:mime'] == b"application/pdf"
+    assert struct.unpack("!q", row[b'grobid0:status_code'])[0] == 200
+    status = json.loads(row[b'grobid0:status'].decode('utf-8'))
+    # isinstance() is the idiomatic type check
+    assert isinstance(status, dict)
+    assert row[b'grobid0:tei_xml'].decode('utf-8') == REAL_TEI_XML
+    tei_json = json.loads(row[b'grobid0:tei_json'].decode('utf-8'))
+    metadata = json.loads(row[b'grobid0:metadata'].decode('utf-8'))
+    assert tei_json['title'] == metadata['title']
+    assert 'body' in tei_json
+    assert 'body' not in metadata
+
+def test_parse_ungrobided_invalid(job):
+    """job.parse_ungrobided_line() flags malformed input lines as invalid."""
+
+    def parse_rejected(line):
+        """Parse `line`, require that no info came back, and return the status."""
+        parsed, outcome = job.parse_ungrobided_line(line)
+        assert parsed is None
+        return outcome
+
+    print("space-prefixed line")
+    outcome = parse_rejected(" com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    assert outcome['status'] == "invalid"
+    assert 'prefix' in outcome['reason']
+
+    print("commented line")
+    outcome = parse_rejected("#com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf application/pdf 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    assert outcome['status'] == "invalid"
+    assert 'prefix' in outcome['reason']
+
+    print("wrong column count")
+    outcome = parse_rejected("a b c d e")
+    assert outcome['status'] == "invalid"
+    assert 'parse' in outcome['reason']
+
+    print("CDX line, somehow")
+    outcome = parse_rejected("com,sagepub,cep)/content/28/9/960.full.pdf 20170705062200 http://cep.sagepub.com/content/28/9/960.full.pdf - 200 3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ - - 401 313356621 CITESEERX-CRAWL-2017-06-20-20170705061647307-00039-00048-wbgrp-svc284/CITESEERX-CRAWL-2017-06-20-20170705062052659-00043-31209~wbgrp-svc284.us.archive.org~8443.warc.gz")
+    print(outcome)
+    assert outcome['status'] == "invalid"
+    assert 'parse' in outcome['reason']
+
+def test_parse_ungrobided_valid():
+    """parse_ungrobided_line() splits OK_UNGROBIDED_LINE into key, f:c, mime and cdx parts."""
+
+    line = OK_UNGROBIDED_LINE.decode('utf-8')
+    result = parse_ungrobided_line(line)
+
+    assert result['key'] == "sha1:3I42H3S6NNFQ2MSVX7XZKYAYSCX5QBYJ"
+    assert result['f:c']['u'] == "http://www.ibc7.org/article/file_down.php?mode%3Darticle_print%26pid%3D250"
+    assert result['file:mime'] == "application/pdf"
+
+    cdx = result['file:cdx']
+    assert cdx['c_size'] == 501
+    assert cdx['dt'] == "20170706075411"
diff --git a/python_hadoop/tests/test_grobid2json.py b/python_hadoop/tests/test_grobid2json.py
new file mode 100644
index 0000000..8497b10
--- /dev/null
+++ b/python_hadoop/tests/test_grobid2json.py
@@ -0,0 +1,22 @@
+
+import xml
+import json
+import pytest
+from grobid2json import *
+
+
+def test_small_xml():
+    """teixml2json() on the small TEI fixture must equal the expected JSON fixture."""
+
+    # Explicit encoding keeps fixture loading independent of the locale.
+    with open('tests/files/small.xml', 'r', encoding='utf-8') as f:
+        tei_xml = f.read()
+    with open('tests/files/small.json', 'r', encoding='utf-8') as f:
+        json_form = json.load(f)
+
+    assert teixml2json(tei_xml) == json_form
+
+def test_invalid_xml():
+    """teixml2json() raises ParseError on non-XML input and ValueError on non-TEI XML."""
+
+    # Import the exception explicitly: a bare `import xml` does not load the
+    # xml.etree.ElementTree submodule, so attribute access through `xml`
+    # would only work if some other module had already imported it.
+    from xml.etree.ElementTree import ParseError
+
+    with pytest.raises(ParseError):
+        teixml2json("this is not XML")
+    with pytest.raises(ValueError):
+        teixml2json("<xml></xml>")