-rw-r--r--  fatcat_scholar/identifiers.py               |   2
-rw-r--r--  fatcat_scholar/schema.py                    |  13
-rw-r--r--  fatcat_scholar/search.py                    |   3
-rw-r--r--  fatcat_scholar/templates/access_404.html    |  35
-rw-r--r--  fatcat_scholar/templates/search_macros.html |   3
-rw-r--r--  fatcat_scholar/transform.py                 | 142
-rw-r--r--  fatcat_scholar/web.py                       | 226
-rw-r--r--  notes/scaling_works.md                      |  63
-rw-r--r--  settings.toml                               |  12
-rw-r--r--  tests/files/example_crossref_record.json    | 225
-rw-r--r--  tests/test_refs_transform.py                |  60
-rw-r--r--  tests/test_web.py                           | 103
12 files changed, 772 insertions, 115 deletions
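The search.py and settings.toml hunks below thread a new `ELASTICSEARCH_QUERY_PREFERENCE` setting into fulltext queries. A minimal sketch of the pattern, using the same elasticsearch_dsl `Search.params()` call the diff itself adds (the helper name here is illustrative, not part of the change): the empty-string default leaves shard routing untouched, while `"_local"` in prod prefers shard copies on the node handling the request, so repeated queries are served by consistent shards.

    # sketch only; apply_query_preference is not part of this diff
    from elasticsearch_dsl import Search

    def apply_query_preference(search: Search, preference: str) -> Search:
        # empty-string default (settings.toml) leaves ES shard routing untouched
        if preference:
            # e.g. "_local" in prod: prefer shard copies on the local node
            search = search.params(preference=preference)
        return search
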
diff --git a/fatcat_scholar/identifiers.py b/fatcat_scholar/identifiers.py index 7572e20..9a64de8 100644 --- a/fatcat_scholar/identifiers.py +++ b/fatcat_scholar/identifiers.py @@ -27,7 +27,7 @@ def clean_doi(raw: Optional[str]) -> Optional[str]: if not "10." in raw: return None if not raw.startswith("10."): - raw = raw[raw.find("10."):] + raw = raw[raw.find("10.") :] if raw[7:9] == "//": raw = raw[:8] + raw[9:] diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index e6d0422..0fcf56e 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -270,11 +270,12 @@ class RefBiblio(BaseModel): volume: Optional[str] issue: Optional[str] pages: Optional[str] + version: Optional[str] doi: Optional[str] pmid: Optional[str] pmcid: Optional[str] arxiv_id: Optional[str] - isbn13: Optional[str] + isbn: Optional[str] url: Optional[str] @@ -284,7 +285,7 @@ class RefStructured(BaseModel): work_ident: Optional[str] release_stage: Optional[str] release_year: Optional[int] - index: Optional[int] + index: Optional[int] # 1-indexed key: Optional[str] locator: Optional[str] target_release_id: Optional[str] @@ -300,9 +301,12 @@ class RefTarget(BaseModel): def clean_small_int(raw: Optional[str]) -> Optional[int]: - if not raw or not raw.isdigit(): + if not raw or not raw.strip().isdigit(): + return None + try: + val = int(raw.strip()) + except ValueError: return None - val = int(raw) if abs(val) > 30000: return None return val @@ -317,6 +321,7 @@ def test_clean_small_int() -> None: assert clean_small_int("1200003") == None assert clean_small_int("-123") == None assert clean_small_int("48844") == None + assert clean_small_int("1990²") == None def doi_split_prefix(doi: str) -> str: diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py index 121cb69..dccaf07 100644 --- a/fatcat_scholar/search.py +++ b/fatcat_scholar/search.py @@ -377,6 +377,9 @@ def do_fulltext_search( search = search.params(track_total_hits=True) search = search[offset : (offset + limit)] + if settings.ELASTICSEARCH_QUERY_PREFERENCE: + search = search.params(preference=settings.ELASTICSEARCH_QUERY_PREFERENCE) + query_start = datetime.datetime.now() try: resp = search.execute() diff --git a/fatcat_scholar/templates/access_404.html b/fatcat_scholar/templates/access_404.html new file mode 100644 index 0000000..d058186 --- /dev/null +++ b/fatcat_scholar/templates/access_404.html @@ -0,0 +1,35 @@ +{% extends "base.html" %} + +{% block title %} +404 - {{ super() }} +{% endblock %} + +{% block main %} +<div class="ui icon error message"> + <div class="content"> + <div class="header">{% trans %}404: Access Location Not Found{% endtrans %}</div> + <p>{% trans %}We could not find a valid redirect for the URL you tried. Sorry about that!{% endtrans %} + <p>{% trans %}There may be a typo, truncation, or encoding error. 
Or, the resource may have been removed from our catalog.{% endtrans %} + <p>{% trans %}Some places you can visit try to hunt down this resource (or a replacement) include:{% endtrans %} + <ul> + {% if original_url %} + <li>{% trans %}Original web url:{% endtrans %} + <br> + <code style="word-break: break-all;"><a href="{{ original_url }}">{{ original_url }}</a></code> + </li> + <li><a href="https://web.archive.org/web/*/{{ original_url }}">{% trans %}Wayback Machine calendar page (all captures){% endtrans %}</a> + {% endif %} + {% if archiveorg_path %} + <li>{% trans %}archive.org download link for the item:{% endtrans %} + {% set archiveorg_url="https://archive.org/download" + archiveorg_path %} + <br> + <code style="word-break: break-all;"><a href="{{ archiveorg_url }}">{{ archiveorg_url }}</a></code> + {% endif %} + {% if work_ident %} + <li><a href="/work/{{ work_ident }}">{% trans %}Scholar landing page{% endtrans %}</a> + <li><a href="https://fatcat.wiki/work/{{ work_ident }}">{% trans %}Fatcat catalog page{% endtrans %}</a> + {% endif %} + </ul> + </div> +</div> +{% endblock %} diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html index 4965045..ce50243 100644 --- a/fatcat_scholar/templates/search_macros.html +++ b/fatcat_scholar/templates/search_macros.html @@ -329,7 +329,7 @@ {% endif %} {% if paper.releases|length > 1 %} - {% for release in paper.releases if (release.ident != paper.biblio.release_ident and release.ident != paper.fulltext.release_ident) %} + {% for release in paper.releases if (release.ident != paper.biblio.release_ident and (not paper.fulltext or release.ident != paper.fulltext.release_ident)) %} {% if loop.first %} <h4 class="ui horizontal divider header"> {# <i class="tag icon"></i> #} @@ -386,7 +386,6 @@ <div class="tag-row"> {# ### TAGS #} {# colors to use: olive, brown, grey, pink, red, etc #} - {# TODO: remove doc for ES 7.x-style lack of type #} {# TODO: only show 'json' link if from cluster? 
#} {% if debug_mode %} <a target="_blank" rel="noopener" href="{{ settings.ELASTICSEARCH_PUBLIC_URL }}/{{ settings.ELASTICSEARCH_QUERY_FULLTEXT_INDEX }}/_doc/{{ paper.key }}"> diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index f9616c4..3a7102a 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -483,7 +483,10 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: raise NotImplementedError(f"doc_type: {heavy.doc_type}") # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+ - if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq': + if ( + heavy.grobid_fulltext + and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq" + ): fulltext_release = [ r for r in heavy.releases @@ -603,6 +606,55 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: ) +def clean_ref_key(key: Optional[str], doi: Optional[str] = None) -> Optional[str]: + if not key: + return None + key = key.strip() + if key and doi and key.startswith(doi): + key = key.replace(doi + "-", "") + key = key.replace(doi, "") + if key.startswith("10.") and "SICI" in key and "-" in key: + subkey = key.split("-")[-1] + if subkey: + key = subkey + if key.startswith("10.") and "_" in key: + subkey = key.split("_")[-1] + if subkey: + key = subkey + if len(key) > 10 and "#" in key: + subkey = key.split("#")[-1] + if subkey: + key = subkey + if len(key) > 10 and "_" in key: + subkey = key.split("_")[-1] + if subkey: + key = subkey + if key and key.startswith("ref-"): + key = key[4:] + if len(key) >= 2 and key[0] in ["/", "_"]: + key = key[1:] + if not key: + return None + return key + + +def test_clean_ref_key() -> None: + test_pairs = [ + ("ref-23", None, "23"), + ("_bib0040", None, "bib0040"), + (" 20170224012016_R15", None, "R15"), + ( + "10.1002/(SICI)1099-1026(199905/06)14:3<195::AID-FFJ807>3.0.CO;2-C-BIB1", + None, + "BIB1", + ), + ("BFnrcardio201557_CR175", None, "CR175"), + ("2019121710443552100_", None, "2019121710443552100_"), + ] + for raw, doi, expected in test_pairs: + assert clean_ref_key(raw, doi=doi) == expected + + def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructured]: output = [] for ref in tei_dict.get("citations") or []: @@ -619,6 +671,10 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur if a.get("name"): assert isinstance(a["name"], str) authors.append(a["name"]) + ref_index = ref.get("index") + if ref_index is not None: + # transform from 0-indexed to 1-indexed + ref_index = ref_index + 1 output.append( RefStructured( biblio=RefBiblio( @@ -636,15 +692,15 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur pmid=ref.get("pmid"), pmcid=clean_pmcid(ref.get("pmcid")), arxiv_id=ref.get("arxiv_id"), - # isbn13: Optional[str] + isbn=ref.get("isbn"), url=clean_url_conservative(ref.get("url")), ), release_ident=release.ident, work_ident=release.work_id, release_stage=release.release_stage, release_year=release.release_year, - index=ref.get("index"), - key=ref.get("id"), + index=ref_index, + key=clean_ref_key(ref.get("id")), locator=None, # target_release_id ref_source="grobid", @@ -658,14 +714,6 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: for ref in release.refs: ref_source = "fatcat" - key = ref.key - if key and release.ext_ids.doi and key.startswith(release.ext_ids.doi): - key = key.replace(release.ext_ids.doi, "") - if key 
and key.startswith("ref-"): - key = key[4:] - if key and key.startswith("b"): - key = key[1:] - if release.extra and release.extra.get("pubmed"): ref_source = "fatcat-pubmed" elif release.extra and release.extra.get("crossref"): @@ -676,6 +724,10 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: extra = ref.extra or dict() authors = extra.get("authors") or [] authors = [a for a in authors if type(a) == str] + ref_index = None + if ref.index is not None: + # transform from 0-indexed (release.refs) to 1-indexed (fatcat_refs) + ref_index = ref.index + 1 output.append( RefStructured( biblio=RefBiblio( @@ -689,18 +741,19 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: volume=extra.get("volume"), issue=extra.get("issue"), pages=extra.get("pages") or extra.get("page"), - doi=extra.get("doi"), + doi=clean_doi(extra.get("doi")), pmid=extra.get("pmid"), - pmcid=extra.get("pmcid"), + pmcid=clean_pmcid(extra.get("pmcid")), arxiv_id=extra.get("arxiv_id"), - isbn13=extra.get("isbn13"), + isbn=extra.get("isbn13") or extra.get("isbn"), url=clean_url_conservative(extra.get("url")), ), release_ident=release.ident, work_ident=release.work_id, + release_stage=release.release_stage, release_year=release.release_year, - index=ref.index, - key=key or None, + index=ref_index, + key=clean_ref_key(ref.key, doi=release.ext_ids.doi), locator=ref.locator, target_release_id=ref.target_release_id, ref_source=ref_source, @@ -724,26 +777,41 @@ def refs_from_crossref( authors = [ ref["author"], ] - key = ref.get("key") - if key and key.startswith(record["DOI"]): - key = key.replace(record["DOI"] + "-", "") - key = key.replace(record["DOI"], "") - if key and key.startswith("ref-"): - key = key[4:] + ref_title = ref.get("article-title") ref_container_name = ref.get("journal-title") if not ref_container_name: + ref_container_name = ref.get("container-title") + + # volume-title is often a book title + if not ref_title: + ref_title = ref.get("volume-title") + elif not ref_container_name: ref_container_name = ref.get("volume-title") + + # series-title is a bit weird in Crossref references. it is often + # passed alone and seems to be the article/book title miscategorized. + # other times it is a conference name. 
+ series_title = ref.get("series-title") + if not ref_title: + ref_title = series_title + elif not ref_container_name: + ref_container_name = series_title + + year = ref.get("year") + if year: + year = clean_small_int(year) + else: + year = None date = ref.get("date") - year = None - if date and len(date) >= 4 and date[:4].isdigit(): + if date and not year and len(date) >= 4 and date[:4].isdigit(): year = int(date[:4]) - if year < 1000 or year > 2100: - year = None + if year and (year < 1000 or year > 2100): + year = None output.append( RefStructured( biblio=RefBiblio( unstructured=ref.get("unstructured"), - title=ref.get("article-title"), + title=ref_title, subtitle=ref.get("subtitle"), contrib_raw_names=authors, year=year, @@ -751,15 +819,18 @@ def refs_from_crossref( publisher=ref.get("publisher"), volume=ref.get("volume"), issue=ref.get("issue"), - pages=ref.get("page"), - doi=ref.get("DOI"), + pages=ref.get("first-page"), + version=ref.get("edition"), + doi=clean_doi(ref.get("DOI")), + isbn=ref.get("ISBN"), ), release_ident=release.ident, work_ident=release.work_id, + release_stage=release.release_stage, release_year=release.release_year, - index=i, - key=key or None, - locator=ref.get("first-page"), + index=i + 1, # 1-indexed + key=clean_ref_key(ref.get("key"), doi=record.get("DOI")), + # locator, target_release_id=None, ref_source=ref_source, ) @@ -795,7 +866,10 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]: fulltext_refs: List[RefStructured] = [] # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+ - if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq': + if ( + heavy.grobid_fulltext + and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq" + ): fulltext_release = [ r for r in heavy.releases diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index b5af18e..a705e20 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -20,6 +20,7 @@ from fastapi.responses import ( RedirectResponse, ) from fastapi.middleware.cors import CORSMiddleware +import fatcat_openapi_client import sentry_sdk from sentry_sdk.integrations.asgi import SentryAsgiMiddleware from starlette_prometheus import metrics, PrometheusMiddleware @@ -182,72 +183,6 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict return doc -@api.get( - "/work/{work_ident}/access/wayback/{url:path}", - operation_id="access_redirect_wayback", - include_in_schema=False, -) -def access_redirect_wayback( - url: str, - request: Request, - work_ident: str = Query(..., min_length=20, max_length=20), -) -> Any: - raw_original_url = "/".join(str(request.url).split("/")[7:]) - # the quote() call is necessary because the URL is un-encoded in the path parameter - # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d - original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",) - doc_dict = get_es_scholar_doc(f"work_{work_ident}") - if not doc_dict: - raise HTTPException(status_code=404, detail="work not found") - doc: ScholarDoc = doc_dict["_obj"] - # combine fulltext with all access options - access: List[Any] = [] - if doc.fulltext: - access.append(doc.fulltext) - access.extend(doc.access or []) - for opt in access: - if ( - opt.access_type == "wayback" - and opt.access_url - and "://web.archive.org/web/" in opt.access_url - and opt.access_url.endswith(original_url) - ): - timestamp = 
opt.access_url.split("/")[4] - if not (len(timestamp) == 14 and timestamp.isdigit()): - continue - access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}" - return RedirectResponse(access_url, status_code=302) - raise HTTPException(status_code=404, detail="access URL not found") - - -@api.get( - "/work/{work_ident}/access/ia_file/{item}/{file_path:path}", - operation_id="access_redirect_ia_file", - include_in_schema=False, -) -def access_redirect_ia_file( - item: str, - file_path: str, - request: Request, - work_ident: str = Query(..., min_length=20, max_length=20), -) -> Any: - original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:])) - access_url = f"https://archive.org/download/{item}/{original_path}" - doc_dict = get_es_scholar_doc(f"work_{work_ident}") - if not doc_dict: - raise HTTPException(status_code=404, detail="work not found") - doc: ScholarDoc = doc_dict["_obj"] - # combine fulltext with all access options - access: List[Any] = [] - if doc.fulltext: - access.append(doc.fulltext) - access.extend(doc.access or []) - for opt in access: - if opt.access_type == "ia_file" and opt.access_url == access_url: - return RedirectResponse(access_url, status_code=302) - raise HTTPException(status_code=404, detail="access URL not found") - - web = APIRouter() @@ -413,6 +348,165 @@ def web_work( ) +def access_redirect_fallback( + request: Request, + work_ident: str, + original_url: Optional[str] = None, + archiveorg_path: Optional[str] = None, +) -> Any: + """ + The purpose of this helper is to catch access redirects which would + otherwise return a 404, and "try harder" to find a redirect. + """ + # lookup against the live fatcat API, instead of scholar ES index + api_conf = fatcat_openapi_client.Configuration() + api_conf.host = settings.FATCAT_API_HOST + api_client = fatcat_openapi_client.DefaultApi( + fatcat_openapi_client.ApiClient(api_conf) + ) + + # fetch list of releases for this work from current fatcat catalog. 
note + # that these releases are not expanded (don't include file entities) + try: + # fetch work entity itself to fail fast (true 404) and handle redirects + work_entity = api_client.get_work(work_ident) + logger.warning( + f"access_redirect_fallback: work_{work_ident} state={work_entity.state} redirect={work_entity.redirect}" + ) + if work_entity.redirect: + work_ident = work_entity.redirect + partial_releases = api_client.get_work_releases( + ident=work_ident, hide="abstracts,references", + ) + except fatcat_openapi_client.ApiException as ae: + raise HTTPException( + status_code=ae.status, + detail=f"Fatcat API call failed for work_{work_ident}", + ) + + # for each release, check for any archive.org access option with the given context + for partial in partial_releases: + release = api_client.get_release( + partial.ident, + expand="files", + # TODO: expand="files,filesets,webcaptures", + hide="abstracts,references", + ) + if not release.files: + continue + for fe in release.files: + for url_pair in fe.urls: + access_url = url_pair.url + if ( + original_url + and "://web.archive.org/web/" in access_url + and access_url.endswith(original_url) + ): + # TODO: test/verify this + timestamp = access_url.split("/")[4] + # if not (len(timestamp) == 14 and timestamp.isdigit()): + # continue + replay_url = ( + f"https://web.archive.org/web/{timestamp}id_/{original_url}" + ) + return RedirectResponse(replay_url, status_code=302) + elif ( + archiveorg_path + and "://archive.org/" in access_url + and archiveorg_path in access_url + ): + return RedirectResponse(access_url, status_code=302) + + # give up and show an error page + lang = LangPrefix(request) + return i18n_templates[lang.code].TemplateResponse( + "access_404.html", + { + "request": request, + "locale": lang.code, + "lang_prefix": lang.prefix, + "work_ident": work_ident, + "original_url": original_url, + "archiveorg_path": archiveorg_path, + }, + status_code=404, + ) + + +@web.get( + "/work/{work_ident}/access/wayback/{url:path}", + operation_id="access_redirect_wayback", + include_in_schema=False, +) +def access_redirect_wayback( + url: str, + request: Request, + work_ident: str = Query(..., min_length=20, max_length=20), +) -> Any: + raw_original_url = "/".join(str(request.url).split("/")[7:]) + # the quote() call is necessary because the URL is un-encoded in the path parameter + # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d + original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",) + doc_dict = get_es_scholar_doc(f"work_{work_ident}") + if not doc_dict: + return access_redirect_fallback( + request, work_ident=work_ident, original_url=original_url + ) + doc: ScholarDoc = doc_dict["_obj"] + # combine fulltext with all access options + access: List[Any] = [] + if doc.fulltext: + access.append(doc.fulltext) + access.extend(doc.access or []) + for opt in access: + if ( + opt.access_type == "wayback" + and opt.access_url + and "://web.archive.org/web/" in opt.access_url + and opt.access_url.endswith(original_url) + ): + timestamp = opt.access_url.split("/")[4] + if not (len(timestamp) == 14 and timestamp.isdigit()): + continue + access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}" + return RedirectResponse(access_url, status_code=302) + return access_redirect_fallback( + request, work_ident=work_ident, original_url=original_url + ) + + +@web.get( + "/work/{work_ident}/access/ia_file/{item}/{file_path:path}", + operation_id="access_redirect_ia_file", 
+ include_in_schema=False, +) +def access_redirect_ia_file( + item: str, + file_path: str, + request: Request, + work_ident: str = Query(..., min_length=20, max_length=20), +) -> Any: + original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:])) + access_url = f"https://archive.org/download/{item}/{original_path}" + doc_dict = get_es_scholar_doc(f"work_{work_ident}") + if not doc_dict: + return access_redirect_fallback( + request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}" + ) + doc: ScholarDoc = doc_dict["_obj"] + # combine fulltext with all access options + access: List[Any] = [] + if doc.fulltext: + access.append(doc.fulltext) + access.extend(doc.access or []) + for opt in access: + if opt.access_type == "ia_file" and opt.access_url == access_url: + return RedirectResponse(access_url, status_code=302) + return access_redirect_fallback( + request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}" + ) + + app = FastAPI( title="Fatcat Scholar", description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.", diff --git a/notes/scaling_works.md b/notes/scaling_works.md index 3b004ef..60b4597 100644 --- a/notes/scaling_works.md +++ b/notes/scaling_works.md @@ -657,3 +657,66 @@ So added `--compress` and the `--tmpdir` (which needed to be created): | esbulk -verbose -size 100 -id key -w 4 -index scholar_fulltext_v01 -type _doc \ 2> /tmp/error.txt 1> /tmp/output.txt +## 2021-06-06 Simple Iteration + +Some new paths, more parallelism, and more conservative file naming/handling, +but otherwise not much changed from the 2020-12-30 run above. + + export JOBDIR=/kubwa/scholar/2021-06-03 + mkdir -p $JOBDIR + cd $JOBDIR + zcat /fast/release_export_expanded.json.gz | split --lines 8000000 - release_export_expanded.split_ -d --additional-suffix .json + + cd /fast/fatcat-scholar + pipenv shell + export TMPDIR=/sandcrawler-db/tmp + + # transform + set -u -o pipefail + for SHARD in {00..20}; do + cat $JOBDIR/release_export_expanded.split_$SHARD.json \ + | parallel -j8 --line-buffer --compress --tmpdir $TMPDIR --round-robin --pipe python -m fatcat_scholar.work_pipeline run_releases \ + | pv -l \ + | pigz \ + > $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz.WIP \ + && mv $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz.WIP $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz + done + + # dump refs + set -u -o pipefail + for SHARD in {00..20}; do + zcat $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz \ + | pv -l \ + | parallel -j8 --linebuffer --compress --tmpdir $TMPDIR --round-robin --pipe python -m fatcat_scholar.transform run_refs \ + | pigz \ + > $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.refs.json.gz.WIP \ + && mv $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.refs.json.gz.WIP $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.refs.json.gz + done + +Ran in to a problem with a single (!) bad TEI-XML document, due to bad text +encoding: + + xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 40, column 1122 + +Root cause was an issue in GROBID, which seems to have been fixed in more +recent versions of GROBID. Patched to continue, and separately commited patch +to fatcat-scholar code base. + +Ran several retries, manually. 
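A minimal sketch of that kind of "patch to continue" guard, assuming the TEI string is parsed with the standard library (the separately committed patch may differ; `parse_grobid_tei` is a stand-in name):

    import xml.etree.ElementTree as ET
    from typing import Optional

    def parse_grobid_tei(tei_xml: str) -> Optional[ET.Element]:
        # skip a malformed TEI-XML document instead of aborting the whole batch
        try:
            return ET.fromstring(tei_xml)
        except ET.ParseError:
            return None
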
+ +Upload to petabox: + + export BASENAME=scholar_corpus_bundle_2021-06-03 + for SHARD in {00..20}; do + ia upload ${BASENAME}_split-${SHARD} $JOBDIR/README.md $JOBDIR/fatcat_scholar_work_fulltext.split_${SHARD}.json.gz -m collection:"scholarly-tdm" --checksum + done + + ia upload scholar_corpus_refs_2021-06-03 fatcat_scholar_work_fulltext.split_*.refs.json.gz -m collection:"scholarly-tdm" --checksum + + +### Performance Notes (on 2021-06-06 run) + +Recently added crossref refs via sandcrawler-db postgrest lookup. Seem to still +be getting around 40/sec works per second, with a single thread, similar to +previous performance, so not a significant slow down. + diff --git a/settings.toml b/settings.toml index e2bc6d6..07ba1bd 100644 --- a/settings.toml +++ b/settings.toml @@ -5,6 +5,7 @@ SCHOLAR_ISSUEDB_PATH = "data/issue_db.sqlite" I18N_LANG_DEFAULT = "en" ELASTICSEARCH_QUERY_BASE = "http://localhost:9200" ELASTICSEARCH_QUERY_FULLTEXT_INDEX = "scholar_fulltext" +ELASTICSEARCH_QUERY_PREFERENCE = "" ELASTICSEARCH_WRITE_BASE = "http://localhost:9200" ELASTICSEARCH_WRITE_FULLTEXT_INDEX = "scholar_fulltext_v01" ELASTICSEARCH_PUBLIC_URL = "http://localhost:9292" @@ -50,14 +51,14 @@ KAFKA_BROKERS = ["localhost"] [development-qa] SCHOLAR_ENV = "dev" -ELASTICSEARCH_QUERY_BASE = "http://scholar-svc500.fatcat.wiki:9292" -ELASTICSEARCH_PUBLIC_URL = "http://scholar-svc500.fatcat.wiki:9292" +ELASTICSEARCH_QUERY_BASE = "https://search.fatcat.wiki" +ELASTICSEARCH_PUBLIC_URL = "https://search.fatcat.wiki" [qa] SCHOLAR_ENV = "qa" -ELASTICSEARCH_QUERY_BASE = "http://scholar-svc500.fatcat.wiki:9292" +ELASTICSEARCH_QUERY_BASE = "https://search.fatcat.wiki" ELASTICSEARCH_WRITE_BASE = "http://localhost:9200" -ELASTICSEARCH_PUBLIC_URL = "http://scholar-svc500.fatcat.wiki:9292" +ELASTICSEARCH_PUBLIC_URL = "https://search.fatcat.wiki" KAFKA_BROKERS = ["wbgrp-svc263.us.archive.org"] [prod] @@ -65,9 +66,10 @@ SCHOLAR_ENV = "prod" ONION_DOMAIN = "scholar.archivev3qli37bju4rlh27glh24lljyezwxf4pokmrdbpefjlcrp5id.onion" ELASTICSEARCH_QUERY_BASE = "http://localhost:9292" ELASTICSEARCH_QUERY_FULLTEXT_INDEX = "scholar_fulltext" +ELASTICSEARCH_QUERY_PREFERENCE = "_local" ELASTICSEARCH_WRITE_BASE = "http://localhost:9200" ELASTICSEARCH_WRITE_FULLTEXT_INDEX = "scholar_fulltext_v01_20210128" -ELASTICSEARCH_PUBLIC_URL = "http://scholar-svc500.fatcat.wiki:9292" +ELASTICSEARCH_PUBLIC_URL = "https://search.fatcat.wiki" KAFKA_BROKERS = ["wbgrp-svc263.us.archive.org"] ENABLE_GOATCOUNTER = true GOATCOUNTER_ENDPOINT = "/goatcounter/count" diff --git a/tests/files/example_crossref_record.json b/tests/files/example_crossref_record.json new file mode 100644 index 0000000..d87c7c2 --- /dev/null +++ b/tests/files/example_crossref_record.json @@ -0,0 +1,225 @@ +{ + "doi": "10.1515/jpm-2019-0016", + "record": +{ + "DOI": "10.1111/his.12200", + "ISSN": [ + "0309-0167" + ], + "URL": "http://dx.doi.org/10.1111/his.12200", + "author": [ + { + "affiliation": [], + "family": "Stewart", + "given": "Colin J R" + } + ], + "container-title": [ + "Histopathology" + ], + "content-domain": { + "crossmark-restriction": false, + "domain": [] + }, + "created": { + "date-parts": [ + [ + 2013, + 6, + 3 + ] + ], + "date-time": "2013-06-03T16:37:56Z", + "timestamp": 1370277476000 + }, + "deposited": { + "date-parts": [ + [ + 2017, + 6, + 21 + ] + ], + "date-time": "2017-06-21T14:04:36Z", + "timestamp": 1498053876000 + }, + "indexed": { + "date-parts": [ + [ + 2020, + 7, + 28 + ] + ], + "date-time": "2020-07-28T14:37:55Z", + "timestamp": 1595947075455 + }, + 
"is-referenced-by-count": 0, + "issn-type": [ + { + "type": "print", + "value": "0309-0167" + } + ], + "issued": { + "date-parts": [ + [ + 2013, + 7 + ] + ] + }, + "license": [ + { + "URL": "http://doi.wiley.com/10.1002/tdm_license_1.1", + "content-version": "tdm", + "delay-in-days": 792, + "start": { + "date-parts": [ + [ + 2015, + 9, + 1 + ] + ], + "date-time": "2015-09-01T00:00:00Z", + "timestamp": 1441065600000 + } + } + ], + "link": [ + { + "URL": "https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1111%2Fhis.12200", + "content-type": "unspecified", + "content-version": "vor", + "intended-application": "text-mining" + } + ], + "member": "311", + "original-title": [], + "page": "n/a-n/a", + "prefix": "10.1111", + "published-online": { + "date-parts": [ + [ + 2013, + 7, + 16 + ] + ] + }, + "published-print": { + "date-parts": [ + [ + 2013, + 7 + ] + ] + }, + "publisher": "Wiley", + "reference": [ + { + "DOI": "10.5858/arpa.2012-0112-RA", + "article-title": "The separation of benign and malignant mesothelial proliferations", + "author": "Churg", + "doi-asserted-by": "crossref", + "first-page": "1217", + "journal-title": "Arch. Pathol. Lab. Med.", + "key": "10.1111/his.12200-BIB0001|his12200-cit-0001", + "volume": "136", + "year": "2012" + }, + { + "DOI": "10.1136/jcp.2010.086074", + "article-title": "Peritoneal mesothelial hyperplasia associated with gynaecological disease: a potential diagnostic pitfall that is commonly associated with endometriosis", + "author": "Opraka", + "doi-asserted-by": "crossref", + "first-page": "313", + "journal-title": "J. Clin. Pathol.", + "key": "10.1111/his.12200-BIB0002|his12200-cit-0002", + "volume": "64", + "year": "2011" + }, + { + "DOI": "10.1038/modpathol.2012.105", + "article-title": "Deciduoid mesothelioma: report of 21 cases with review of the literature", + "author": "Ordonez", + "doi-asserted-by": "crossref", + "first-page": "1481", + "journal-title": "Mod. Pathol.", + "key": "10.1111/his.12200-BIB0003|his12200-cit-0003", + "volume": "25", + "year": "2012" + }, + { + "DOI": "10.1111/j.1525-1438.2006.00509.x", + "article-title": "Atypical reactive ovarian surface epithelium, a pitfall in pathologic assessment", + "author": "Aydin", + "doi-asserted-by": "crossref", + "first-page": "207", + "issue": "Suppl. 1", + "journal-title": "Int. J. Gynecol. Cancer", + "key": "10.1111/his.12200-BIB0004|his12200-cit-0004", + "volume": "16", + "year": "2006" + }, + { + "DOI": "10.1097/PAP.0b013e3180ca7d7b", + "article-title": "The pathology of endometriosis: a survey of the many faces of a common disease emphasizing diagnostic pitfalls and unusual and newly appreciated aspects", + "author": "Clement", + "doi-asserted-by": "crossref", + "first-page": "241", + "journal-title": "Adv. Anat. Pathol.", + "key": "10.1111/his.12200-BIB0005|his12200-cit-0005", + "volume": "14", + "year": "2007" + }, + { + "article-title": "Extramedullary hematopoiesis associated with organizing peritoneal hemorrhage: a report of 5 cases in patients presenting with primary gynecological disorders", + "author": "Mesbah Ardakani", + "journal-title": "Int. J. Gynecol. Pathol.", + "key": "10.1111/his.12200-BIB0006|his12200-cit-0006" + }, + { + "key": "10.1016/B0-12-227090-8/00204-9_bib5", + "series-title": "Advances in Laser Remote Sensing – Selected Papers Presented at the 20th International Laser Radar Conference", + "year": "2001" + }, + { + "key": "CIT0041", + "unstructured": "Linda Weiss,Creating Capitalism. Oxford: Blackwell, 1988. 272 pp. £29.95. ISBN 0 631 15733 6." 
+ }, + { + "author": "L Piegl", + "edition": "2", + "key": "576_CR3", + "unstructured": "Piegl L, Tiller W (1997) The NURBS Book, Monographs in Visual Communication, 2nd edn. Springer, Berlin", + "volume-title": "The NURBS Book, Monographs in Visual Communication", + "year": "1997" + } + ], + "reference-count": 6, + "references-count": 6, + "relation": { + "cites": [] + }, + "score": null, + "short-container-title": [ + "Histopathology" + ], + "short-title": [], + "source": "Crossref", + "subject": [ + "Pathology and Forensic Medicine", + "Histology", + "General Medicine" + ], + "subtitle": [], + "title": [ + "Deciduoid mesothelial hyperplasia of the pelvic peritoneum" + ], + "type": "journal-article" +}, + "release_ident": "arzkbn5brjf2nitdy4fkiusc4q" +} + diff --git a/tests/test_refs_transform.py b/tests/test_refs_transform.py index 3fa490b..078b73b 100644 --- a/tests/test_refs_transform.py +++ b/tests/test_refs_transform.py @@ -1,7 +1,8 @@ +import json from fatcat_openapi_client import ReleaseEntity from fatcat_scholar.grobid2json import teixml2json -from fatcat_scholar.transform import refs_from_grobid +from fatcat_scholar.transform import refs_from_grobid, refs_from_crossref def test_transform_refs_grobid() -> None: @@ -27,7 +28,7 @@ def test_transform_refs_grobid() -> None: assert ref.release_year == 1234 assert ref.ref_source == "grobid" assert ref.key == "b12" - assert ref.index == 12 + assert ref.index == 13 assert ref.locator == None assert ref.biblio.contrib_raw_names is not None assert ref.biblio.contrib_raw_names[0] == "K Tasa" @@ -40,3 +41,58 @@ def test_transform_refs_grobid() -> None: ref.biblio.unstructured == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." ) + + +def test_transform_refs_crossref() -> None: + + with open("tests/files/example_crossref_record.json", "r") as f: + record = json.loads(f.read()) + + dummy_release = ReleaseEntity( + ident="releasedummy22222222222222", + work_id="workdummy22222222222222222", + release_year=1234, + release_stage="accepted", + ext_ids={}, + ) + + refs = refs_from_crossref(dummy_release, record) + + assert refs[0].release_ident == "releasedummy22222222222222" + assert refs[0].work_ident == "workdummy22222222222222222" + assert refs[0].release_stage == "accepted" + assert refs[0].release_year == 1234 + assert refs[0].ref_source == "crossref" + assert refs[0].key == "BIB0001|his12200-cit-0001" + assert refs[0].index == 1 + assert refs[0].locator is None + assert refs[0].biblio.contrib_raw_names is not None + assert refs[0].biblio.contrib_raw_names[0] == "Churg" + assert refs[0].biblio.container_name == "Arch. Pathol. Lab. Med." + assert ( + refs[0].biblio.title + == "The separation of benign and malignant mesothelial proliferations" + ) + assert refs[0].biblio.year == 2012 + assert refs[0].biblio.pages == "1217" + assert refs[0].biblio.volume == "136" + assert refs[0].biblio.doi == "10.5858/arpa.2012-0112-ra" + assert refs[0].biblio.unstructured is None + + assert ( + refs[6].biblio.title + == "Advances in Laser Remote Sensing – Selected Papers Presented at the 20th International Laser Radar Conference" + ) + assert refs[6].biblio.year == 2001 + + assert refs[7].key == "CIT0041" + assert ( + refs[7].biblio.unstructured + == "Linda Weiss,Creating Capitalism. Oxford: Blackwell, 1988. 272 pp. £29.95. ISBN 0 631 15733 6." 
+ ) + + assert refs[8].key == "576_CR3" + assert refs[8].biblio.unstructured is not None + assert refs[8].biblio.title == "The NURBS Book, Monographs in Visual Communication" + assert refs[8].biblio.year == 1997 + assert refs[8].biblio.version == "2" diff --git a/tests/test_web.py b/tests/test_web.py index 7f1f72a..d9cfab6 100644 --- a/tests/test_web.py +++ b/tests/test_web.py @@ -3,6 +3,7 @@ from typing import Any import pytest from fastapi.testclient import TestClient +import fatcat_openapi_client from fatcat_scholar.web import app @@ -148,7 +149,11 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None: == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf" ) - # check that URL is validated + # check that URL is validated (force fatcat API fallback to fail) + fatcat_api_raw = mocker.patch("fatcat_openapi_client.ApiClient.call_api") + fatcat_api_raw.side_effect = [ + fatcat_openapi_client.ApiException(status=404, reason="dummy") + ] rv = client.get( "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY", allow_redirects=False, @@ -156,6 +161,102 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None: assert rv.status_code == 404 +def test_access_redirect_fallback(client: Any, mocker: Any) -> None: + + with open("tests/files/elastic_fulltext_get.json") as f: + elastic_resp = json.loads(f.read()) + + es_raw = mocker.patch( + "elasticsearch.connection.Urllib3HttpConnection.perform_request" + ) + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + ] + fatcat_get_work_raw = mocker.patch("fatcat_openapi_client.DefaultApi.get_work") + fatcat_get_work_raw.side_effect = [ + fatcat_openapi_client.WorkEntity( + state="active", ident="wwwwwwwwwwwwwwwwwwwwwwwwww", + ) + ] * 4 + fatcat_get_work_releases_raw = mocker.patch( + "fatcat_openapi_client.DefaultApi.get_work_releases" + ) + fatcat_get_work_releases_raw.side_effect = [ + [ + fatcat_openapi_client.ReleaseEntity( + ident="rrrrrrrrrrrrrrrrrrrrrrrrrr", + ext_ids=fatcat_openapi_client.ReleaseExtIds(), + ), + ] + ] * 4 + fatcat_get_release_raw = mocker.patch( + "fatcat_openapi_client.DefaultApi.get_release" + ) + fatcat_get_release_raw.side_effect = [ + fatcat_openapi_client.ReleaseEntity( + state="active", + ident="rrrrrrrrrrrrrrrrrrrrrrrrrr", + ext_ids=fatcat_openapi_client.ReleaseExtIds(), + files=[ + fatcat_openapi_client.FileEntity( + ident="ffffffffffffffffffffffffff", + urls=[ + fatcat_openapi_client.FileUrl( + rel="web", url="https://blarg.example.com", + ), + fatcat_openapi_client.FileUrl( + rel="webarchive", + url="https://web.archive.org/web/12345/https://example.com", + ), + fatcat_openapi_client.FileUrl( + rel="archive", + url="https://archive.org/download/some/thing.pdf", + ), + ], + ), + ], + ) + ] * 4 + + # redirects should work after API lookup, for both wayback and archive.org + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://example.com", + allow_redirects=False, + ) + assert rv.status_code == 302 + assert ( + rv.headers["Location"] + == "https://web.archive.org/web/12345id_/https://example.com" + ) + + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.pdf", + allow_redirects=False, + ) + assert rv.status_code == 302 + assert rv.headers["Location"] == 
"https://archive.org/download/some/thing.pdf" + + # wrong URLs should still not work, but display a page with helpful links + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY", + allow_redirects=False, + ) + assert rv.status_code == 404 + assert b"Access Location Not Found" in rv.content + assert b"web.archive.org/web/*/https://www.federalreserve.gov" in rv.content + + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.else.pdf", + allow_redirects=False, + ) + assert rv.status_code == 404 + assert b"Access Location Not Found" in rv.content + assert b"archive.org/download/some/thing.else.pdf" in rv.content + + def test_access_redirect_encoding(client: Any, mocker: Any) -> None: with open("tests/files/elastic_get_work_a6gvpil4brdgzhqyaog3ftngqe.json") as f: |