aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--tests/data/release/5a22nt42bvfj7m3dzfm7br73ni46
-rw-r--r--tests/data/release/63ht2plao5c4dasjeqj7vwglmq46
-rw-r--r--tests/data/release/zchlsocreffbdm36qhzy3cs32e287
-rw-r--r--tests/data/verify.csv5
4 files changed, 383 insertions, 1 deletions
diff --git a/tests/data/release/5a22nt42bvfj7m3dzfm7br73ni b/tests/data/release/5a22nt42bvfj7m3dzfm7br73ni
new file mode 100644
index 0000000..dbd6354
--- /dev/null
+++ b/tests/data/release/5a22nt42bvfj7m3dzfm7br73ni
@@ -0,0 +1,46 @@
+{
+ "abstracts": [
+ {
+ "content": "The majority of text is stored in UTF-8, which must be validated on\ningestion. We present the lookup algorithm, which outperforms UTF-8 validation\nroutines used in many libraries and languages by more than 10 times using\ncommonly available SIMD instructions. To ensure reproducibility, our work is\nfreely available as open source software.",
+ "lang": "en",
+ "mimetype": "text/plain",
+ "sha1": "bb3b47ddb92b73cd378bd18ebee600472e817274"
+ }
+ ],
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "John Keiser",
+ "role": "author"
+ },
+ {
+ "index": 1,
+ "raw_name": "Daniel Lemire",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "arxiv": "2010.03090v2"
+ },
+ "extra": {
+ "arxiv": {
+ "base_id": "2010.03090",
+ "categories": [
+ "cs.DB"
+ ]
+ }
+ },
+ "ident": "5a22nt42bvfj7m3dzfm7br73ni",
+ "language": "en",
+ "license_slug": "CC-BY",
+ "refs": [],
+ "release_date": "2020-10-10",
+ "release_stage": "submitted",
+ "release_type": "article",
+ "release_year": 2020,
+ "revision": "305afdf5-ae10-49bf-a740-ea1740390afb",
+ "state": "active",
+ "title": "Validating UTF-8 In Less Than One Instruction Per Byte",
+ "version": "v2",
+ "work_id": "pgixwot2knfmtfq5pwjr4cazf4"
+}
diff --git a/tests/data/release/63ht2plao5c4dasjeqj7vwglmq b/tests/data/release/63ht2plao5c4dasjeqj7vwglmq
new file mode 100644
index 0000000..41888ac
--- /dev/null
+++ b/tests/data/release/63ht2plao5c4dasjeqj7vwglmq
@@ -0,0 +1,46 @@
+{
+ "abstracts": [
+ {
+ "content": "The majority of text is stored in UTF-8, which must be validated on\ningestion. We present the lookup algorithm, which outperforms UTF-8 validation\nroutines used in many libraries and languages by more than 10 times using\ncommonly available SIMD instructions. To ensure reproducibility, our work is\nfreely available as open source software.",
+ "lang": "en",
+ "mimetype": "text/plain",
+ "sha1": "bb3b47ddb92b73cd378bd18ebee600472e817274"
+ }
+ ],
+ "contribs": [
+ {
+ "index": 0,
+ "raw_name": "John Keiser",
+ "role": "author"
+ },
+ {
+ "index": 1,
+ "raw_name": "Daniel Lemire",
+ "role": "author"
+ }
+ ],
+ "ext_ids": {
+ "arxiv": "2010.03090v1"
+ },
+ "extra": {
+ "arxiv": {
+ "base_id": "2010.03090",
+ "categories": [
+ "cs.DB"
+ ]
+ }
+ },
+ "ident": "63ht2plao5c4dasjeqj7vwglmq",
+ "language": "en",
+ "license_slug": "CC-BY",
+ "refs": [],
+ "release_date": "2020-10-06",
+ "release_stage": "submitted",
+ "release_type": "article",
+ "release_year": 2020,
+ "revision": "a66120cf-3c39-48da-8d90-f63513840bab",
+ "state": "active",
+ "title": "Validating UTF-8 In Less Than One Instruction Per Byte",
+ "version": "v1",
+ "work_id": "pgixwot2knfmtfq5pwjr4cazf4"
+}
diff --git a/tests/data/release/zchlsocreffbdm36qhzy3cs32e b/tests/data/release/zchlsocreffbdm36qhzy3cs32e
new file mode 100644
index 0000000..9007c19
--- /dev/null
+++ b/tests/data/release/zchlsocreffbdm36qhzy3cs32e
@@ -0,0 +1,287 @@
+{
+ "abstracts": [],
+ "container_id": "afve5b3tavbfzch46mdtazbqla",
+ "contribs": [
+ {
+ "extra": {
+ "seq": "first"
+ },
+ "given_name": "John",
+ "index": 0,
+ "raw_affiliation": "Microsoft Redmond Washington USA",
+ "raw_name": "John Keiser",
+ "role": "author",
+ "surname": "Keiser"
+ },
+ {
+ "creator_id": "d7rv3jglprcd5nloqpfonppiom",
+ "given_name": "Daniel",
+ "index": 1,
+ "raw_affiliation": "DOT‐Lab Research Center Université du Québec (TELUQ) Montreal Quebec Canada",
+ "raw_name": "Daniel Lemire",
+ "role": "author",
+ "surname": "Lemire"
+ }
+ ],
+ "ext_ids": {
+ "doi": "10.1002/spe.2920"
+ },
+ "extra": {
+ "crossref": {
+ "alternative-id": [
+ "10.1002/spe.2920"
+ ],
+ "archive": [
+ "Portico"
+ ],
+ "funder": [
+ {
+ "DOI": "10.13039/501100000046",
+ "award": [
+ "RGPIN‐2017‐03910"
+ ],
+ "doi-asserted-by": "publisher",
+ "name": "National Research Council Canada"
+ }
+ ],
+ "license": [
+ {
+ "URL": "http://onlinelibrary.wiley.com/termsAndConditions#vor",
+ "content-version": "vor",
+ "delay-in-days": 0,
+ "start": "2020-10-29T00:00:00Z"
+ }
+ ],
+ "subject": [
+ "Software"
+ ],
+ "type": "journal-article"
+ }
+ },
+ "ident": "zchlsocreffbdm36qhzy3cs32e",
+ "language": "en",
+ "publisher": "Wiley",
+ "refs": [
+ {
+ "extra": {
+ "unstructured": "YergeauF UTF‐8 a transformation format of ISO 10646; internet engineering task force request for comments 3629;2015.https://tools.ietf.org/html/rfc3629. Accessed July 2020."
+ },
+ "index": 0,
+ "key": "e_1_2_11_2_1"
+ },
+ {
+ "extra": {
+ "unstructured": "The MITRE CorporationCAPEC‐80: using UTF‐8 encoding to bypass validation logic;2019.https://capec.mitre.org/data/definitions/80.html. Accessed July 2020."
+ },
+ "index": 1,
+ "key": "e_1_2_11_3_1"
+ },
+ {
+ "extra": {
+ "unstructured": "ColletY. LZ4 ‐ Extremely fast compression;2020.https://github.com/lz4/lz4. Accessed July 2020."
+ },
+ "index": 2,
+ "key": "e_1_2_11_4_1"
+ },
+ {
+ "container_name": "IEEE Softw",
+ "extra": {
+ "authors": [
+ "Suneja N"
+ ],
+ "doi": "10.1109/ms.2019.2909854",
+ "volume": "36"
+ },
+ "index": 3,
+ "key": "e_1_2_11_5_1",
+ "locator": "96",
+ "title": "Scylladb optimizes database architecture to maximize hardware performance",
+ "year": 2019
+ },
+ {
+ "extra": {
+ "unstructured": "CaiY. Utils: optimize UTF‐8 validation;2019.https://bit.ly/2VrlQ37. Accessed July 2020."
+ },
+ "index": 4,
+ "key": "e_1_2_11_6_1"
+ },
+ {
+ "container_name": "2012 SC Companion: High Performance Computing, Networking Storage and Analysis",
+ "extra": {
+ "authors": [
+ "Cebrián JM"
+ ],
+ "doi": "10.1109/sc.companion.2012.93",
+ "volume-title": "2012 SC Companion: High Performance Computing, Networking Storage and Analysis"
+ },
+ "index": 5,
+ "key": "e_1_2_11_7_1",
+ "locator": "675",
+ "year": 2012
+ },
+ {
+ "extra": {
+ "doi": "10.1145/1133255.1133997"
+ },
+ "index": 6,
+ "key": "e_1_2_11_8_1"
+ },
+ {
+ "extra": {
+ "doi": "10.1109/iceccs.2013.40",
+ "unstructured": "XiaX LoD ZhuF WangX ZhouB. Software internationalization and localization: an industrial experience. Paper presented at: Proceedings of the 2013 18th International Conference on Engineering of Complex Computer Systems Singapore;2013:222‐231."
+ },
+ "index": 7,
+ "key": "e_1_2_11_9_1"
+ },
+ {
+ "container_name": "IITM J Manag IT",
+ "extra": {
+ "authors": [
+ "Singh T"
+ ],
+ "volume": "10"
+ },
+ "index": 8,
+ "key": "e_1_2_11_10_1",
+ "locator": "65",
+ "title": "Fuchsia OS‐a threat to android",
+ "year": 2019
+ },
+ {
+ "extra": {
+ "unstructured": "HöhrmannB Flexible and economical UTF‐8 decoder;2010.http://bjoern.hoehrmann.de/utf‐8/decoder/dfa/. Accessed July 2020."
+ },
+ "index": 9,
+ "key": "e_1_2_11_11_1"
+ },
+ {
+ "container_name": "VLDB J",
+ "extra": {
+ "authors": [
+ "Langdale G"
+ ],
+ "doi": "10.1007/s00778-019-00578-5",
+ "volume": "28"
+ },
+ "index": 10,
+ "key": "e_1_2_11_12_1",
+ "locator": "941",
+ "title": "Parsing gigabytes of JSON per second",
+ "year": 2019
+ },
+ {
+ "extra": {
+ "doi": "10.1007/978-1-4684-2001-2_9"
+ },
+ "index": 11,
+ "key": "e_1_2_11_13_1"
+ },
+ {
+ "extra": {
+ "doi": "10.1145/2807591.2807644",
+ "unstructured": "HoeflerT BelliR. Scientific benchmarking of parallel computing systems: twelve ways to tell the masses when reporting performance results. Paper presented at: Proceedings of the International Conference for High Performance Computing Networking Storage and Analysis Austin Texas;2015:1‐12."
+ },
+ "index": 12,
+ "key": "e_1_2_11_14_1"
+ },
+ {
+ "container_name": "ACM Trans Web",
+ "extra": {
+ "authors": [
+ "Muła W"
+ ],
+ "doi": "10.1145/3132709",
+ "volume": "12"
+ },
+ "index": 13,
+ "key": "e_1_2_11_15_1",
+ "title": "Faster Base64 encoding and decoding using AVX2 instructions",
+ "year": 2018
+ },
+ {
+ "extra": {
+ "doi": "10.1002/spe.2777"
+ },
+ "index": 14,
+ "key": "e_1_2_11_16_1"
+ },
+ {
+ "extra": {
+ "doi": "10.1145/1463788.1463811",
+ "unstructured": "CameronRD HerdyKS LinD. High performance XML parsing using parallel bit stream technology. Paper presented at: Proceedings of the 2008 Conference of the Center for Advanced Studies on Collaborative Research: Meeting of Minds CASCON '08;2008:17:222–17:235; New York NY ACM."
+ },
+ "index": 15,
+ "key": "e_1_2_11_17_1"
+ },
+ {
+ "extra": {
+ "doi": "10.1145/2541940.2541988",
+ "unstructured": "MytkowiczT MusuvathiM SchulteW. Data‐parallel finite‐state machines. Paper presented at: Proceedings of the 19th International Conference on Architectural Support for Programming Languages and Operating Systems ASPLOS '14;2014:529‐542; New York NY ACM."
+ },
+ "index": 16,
+ "key": "e_1_2_11_18_1"
+ },
+ {
+ "container_name": "Proc VLDB Endow",
+ "extra": {
+ "authors": [
+ "Mühlbauer T"
+ ],
+ "doi": "10.14778/2556549.2556555",
+ "issue": "14"
+ },
+ "index": 17,
+ "key": "e_1_2_11_19_1",
+ "locator": "1702",
+ "title": "Instant loading for main memory databases",
+ "year": 2013
+ },
+ {
+ "extra": {
+ "doi": "10.1145/1345206.1345222",
+ "unstructured": "CameronRD. A case study in SIMD text processing with parallel bit streams: UTF‐8 to UTF‐16 transcoding. Paper presented at: Proceedings of the 13th ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming;2008:91‐98; ACM New York NY."
+ },
+ "index": 18,
+ "key": "e_1_2_11_20_1"
+ },
+ {
+ "extra": {
+ "doi": "10.1145/3018743.3018760",
+ "unstructured": "JiangP AgrawalG.Combining SIMD and Many/Multi‐core parallelism for finite state machines with enumerative speculation. Paper presented at: Proceedings of the 22nd ACM SIGPLAN Symposium on Principles and Practice of Parallel Programming Austin Texas;2017:179‐191."
+ },
+ "index": 19,
+ "key": "e_1_2_11_21_1"
+ },
+ {
+ "extra": {
+ "doi": "10.1109/mm.2017.35"
+ },
+ "index": 20,
+ "key": "e_1_2_11_22_1"
+ },
+ {
+ "container_name": "Perform Eval",
+ "extra": {
+ "authors": [
+ "Pohl A"
+ ],
+ "doi": "10.1016/j.peva.2020.102106",
+ "volume": "140"
+ },
+ "index": 21,
+ "key": "e_1_2_11_23_1",
+ "locator": "102106",
+ "title": "Vectorization cost modeling for NEON, AVX and SVE",
+ "year": 2020
+ }
+ ],
+ "release_date": "2020-10-29",
+ "release_stage": "published",
+ "release_type": "article-journal",
+ "release_year": 2020,
+ "revision": "4da1f7f7-f39e-4487-aca2-8626b9009497",
+ "state": "active",
+ "title": "Validating UTF‐8 in less than one instruction per byte",
+ "work_id": "izbh6u37vzb2fauftewcwdxi4m"
+}
diff --git a/tests/data/verify.csv b/tests/data/verify.csv
index c4d5172..af05113 100644
--- a/tests/data/verify.csv
+++ b/tests/data/verify.csv
@@ -117,4 +117,7 @@ vqjpcuqxnbhdtelzspxjmklm7u,knuzh5bcqbg7ph7ffvqaiwevti,Status.AMBIGUOUS,Miss.CUST
psykbwxylndtdaand2ymtkgzqu,xizkwvsodzajnn4u7lgeldqoum,Status.DIFFERENT,Miss.YEAR
nb4yakyqebalbatnnfijkfhmka,pr7e4l5eibaavm3zsk62nmphni,Status.STRONG,OK.SLUG_TITLE_AUTHOR_MATCH
in2mm2wafbczjgzlapq55rrksq,oaezupjwnfckxaajjhjb3fl42e,Status.AMBIGUOUS,
-u4mjilmo75bcnjyms564l66jea,6ofr4mqnmrdy3nyyh5ufm5ats4,Status.EXACT,
+u4mjilmo75bcnjyms564l66jea,6ofr4mqnmrdy3nyyh5ufm5ats4,Status.AMBIGUOUS,
+5a22nt42bvfj7m3dzfm7br73ni,63ht2plao5c4dasjeqj7vwglmq,Status.EXACT,OK.WORK_ID
+5a22nt42bvfj7m3dzfm7br73ni,zchlsocreffbdm36qhzy3cs32e,Status.STRONG,
+63ht2plao5c4dasjeqj7vwglmq,zchlsocreffbdm36qhzy3cs32e,Status.STRONG,