{
  "abstracts": [
    {
      "content": "The majority of text is stored in UTF-8, which must be validated on\ningestion. We present the lookup algorithm, which outperforms UTF-8 validation\nroutines used in many libraries and languages by more than 10 times using\ncommonly available SIMD instructions. To ensure reproducibility, our work is\nfreely available as open source software.",
      "lang": "en",
      "mimetype": "text/plain",
      "sha1": "bb3b47ddb92b73cd378bd18ebee600472e817274"
    }
  ],
  "contribs": [
    {
      "index": 0,
      "raw_name": "John Keiser",
      "role": "author"
    },
    {
      "index": 1,
      "raw_name": "Daniel Lemire",
      "role": "author"
    }
  ],
  "ext_ids": {
    "arxiv": "2010.03090v2"
  },
  "extra": {
    "arxiv": {
      "base_id": "2010.03090",
      "categories": [
        "cs.DB"
      ]
    }
  },
  "ident": "5a22nt42bvfj7m3dzfm7br73ni",
  "language": "en",
  "license_slug": "CC-BY",
  "refs": [],
  "release_date": "2020-10-10",
  "release_stage": "submitted",
  "release_type": "article",
  "release_year": 2020,
  "revision": "305afdf5-ae10-49bf-a740-ea1740390afb",
  "state": "active",
  "title": "Validating UTF-8 In Less Than One Instruction Per Byte",
  "version": "v2",
  "work_id": "pgixwot2knfmtfq5pwjr4cazf4"
}