-rw-r--r--   fatcat_scholar/identifiers.py                  2
-rw-r--r--   fatcat_scholar/schema.py                      13
-rw-r--r--   fatcat_scholar/search.py                       3
-rw-r--r--   fatcat_scholar/templates/access_404.html      35
-rw-r--r--   fatcat_scholar/templates/search_macros.html    3
-rw-r--r--   fatcat_scholar/transform.py                  142
-rw-r--r--   fatcat_scholar/web.py                        226
-rw-r--r--   notes/scaling_works.md                        63
-rw-r--r--   settings.toml                                 12
-rw-r--r--   tests/files/example_crossref_record.json     225
-rw-r--r--   tests/test_refs_transform.py                  60
-rw-r--r--   tests/test_web.py                            103
12 files changed, 772 insertions, 115 deletions
diff --git a/fatcat_scholar/identifiers.py b/fatcat_scholar/identifiers.py
index 7572e20..9a64de8 100644
--- a/fatcat_scholar/identifiers.py
+++ b/fatcat_scholar/identifiers.py
@@ -27,7 +27,7 @@ def clean_doi(raw: Optional[str]) -> Optional[str]:
if not "10." in raw:
return None
if not raw.startswith("10."):
- raw = raw[raw.find("10."):]
+ raw = raw[raw.find("10.") :]
if raw[7:9] == "//":
raw = raw[:8] + raw[9:]
diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py
index e6d0422..0fcf56e 100644
--- a/fatcat_scholar/schema.py
+++ b/fatcat_scholar/schema.py
@@ -270,11 +270,12 @@ class RefBiblio(BaseModel):
volume: Optional[str]
issue: Optional[str]
pages: Optional[str]
+ version: Optional[str]
doi: Optional[str]
pmid: Optional[str]
pmcid: Optional[str]
arxiv_id: Optional[str]
- isbn13: Optional[str]
+ isbn: Optional[str]
url: Optional[str]
@@ -284,7 +285,7 @@ class RefStructured(BaseModel):
work_ident: Optional[str]
release_stage: Optional[str]
release_year: Optional[int]
- index: Optional[int]
+ index: Optional[int] # 1-indexed
key: Optional[str]
locator: Optional[str]
target_release_id: Optional[str]
@@ -300,9 +301,12 @@ class RefTarget(BaseModel):
def clean_small_int(raw: Optional[str]) -> Optional[int]:
- if not raw or not raw.isdigit():
+ if not raw or not raw.strip().isdigit():
+ return None
+ try:
+ val = int(raw.strip())
+ except ValueError:
return None
- val = int(raw)
if abs(val) > 30000:
return None
return val
@@ -317,6 +321,7 @@ def test_clean_small_int() -> None:
assert clean_small_int("1200003") == None
assert clean_small_int("-123") == None
assert clean_small_int("48844") == None
+ assert clean_small_int("1990²") == None
def doi_split_prefix(doi: str) -> str:
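
Why clean_small_int() needs both the isdigit() check and the try/except around
int() (a minimal sketch, not part of the diff above): Python's str.isdigit()
returns True for Unicode digit characters such as superscripts, but int()
rejects them, so a year string like "1990²" passes the digit check and would
still raise ValueError without the guard.

    # sketch of the edge case the patched clean_small_int() guards against
    raw = "1990²"
    print(raw.isdigit())    # True -- the superscript "²" counts as a digit
    try:
        val = int(raw)
    except ValueError:
        val = None          # int() cannot parse "²", so fall back to None
    print(val)              # None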
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 121cb69..dccaf07 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -377,6 +377,9 @@ def do_fulltext_search(
search = search.params(track_total_hits=True)
search = search[offset : (offset + limit)]
+ if settings.ELASTICSEARCH_QUERY_PREFERENCE:
+ search = search.params(preference=settings.ELASTICSEARCH_QUERY_PREFERENCE)
+
query_start = datetime.datetime.now()
try:
resp = search.execute()
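
For context on the new ELASTICSEARCH_QUERY_PREFERENCE setting (a hedged sketch
using the elasticsearch-dsl client already used in this module): the preference
search parameter pins repeated queries to a consistent choice of shard copies,
and the "_local" value used in prod favours copies on the node receiving the
request, which keeps scoring and result ordering stable across identical
queries.

    # sketch: threading ELASTICSEARCH_QUERY_PREFERENCE into an elasticsearch-dsl query
    from elasticsearch_dsl import Search

    preference = "_local"  # e.g. settings.ELASTICSEARCH_QUERY_PREFERENCE in the prod profile
    search = Search(index="scholar_fulltext").query("match", title="mesothelial hyperplasia")
    if preference:
        # passed through as a query-string parameter on the search request
        search = search.params(preference=preference)
    # resp = search.execute()  # requires a configured Elasticsearch connection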
diff --git a/fatcat_scholar/templates/access_404.html b/fatcat_scholar/templates/access_404.html
new file mode 100644
index 0000000..d058186
--- /dev/null
+++ b/fatcat_scholar/templates/access_404.html
@@ -0,0 +1,35 @@
+{% extends "base.html" %}
+
+{% block title %}
+404 - {{ super() }}
+{% endblock %}
+
+{% block main %}
+<div class="ui icon error message">
+ <div class="content">
+ <div class="header">{% trans %}404: Access Location Not Found{% endtrans %}</div>
+ <p>{% trans %}We could not find a valid redirect for the URL you tried. Sorry about that!{% endtrans %}
+ <p>{% trans %}There may be a typo, truncation, or encoding error. Or, the resource may have been removed from our catalog.{% endtrans %}
+ <p>{% trans %}Some places you can visit to try to hunt down this resource (or a replacement) include:{% endtrans %}
+ <ul>
+ {% if original_url %}
+ <li>{% trans %}Original web URL:{% endtrans %}
+ <br>
+ <code style="word-break: break-all;"><a href="{{ original_url }}">{{ original_url }}</a></code>
+ </li>
+ <li><a href="https://web.archive.org/web/*/{{ original_url }}">{% trans %}Wayback Machine calendar page (all captures){% endtrans %}</a>
+ {% endif %}
+ {% if archiveorg_path %}
+ <li>{% trans %}archive.org download link for the item:{% endtrans %}
+ {% set archiveorg_url="https://archive.org/download" + archiveorg_path %}
+ <br>
+ <code style="word-break: break-all;"><a href="{{ archiveorg_url }}">{{ archiveorg_url }}</a></code>
+ {% endif %}
+ {% if work_ident %}
+ <li><a href="/work/{{ work_ident }}">{% trans %}Scholar landing page{% endtrans %}</a>
+ <li><a href="https://fatcat.wiki/work/{{ work_ident }}">{% trans %}Fatcat catalog page{% endtrans %}</a>
+ {% endif %}
+ </ul>
+ </div>
+</div>
+{% endblock %}
diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html
index 4965045..ce50243 100644
--- a/fatcat_scholar/templates/search_macros.html
+++ b/fatcat_scholar/templates/search_macros.html
@@ -329,7 +329,7 @@
{% endif %}
{% if paper.releases|length > 1 %}
- {% for release in paper.releases if (release.ident != paper.biblio.release_ident and release.ident != paper.fulltext.release_ident) %}
+ {% for release in paper.releases if (release.ident != paper.biblio.release_ident and (not paper.fulltext or release.ident != paper.fulltext.release_ident)) %}
{% if loop.first %}
<h4 class="ui horizontal divider header">
{# <i class="tag icon"></i> #}
@@ -386,7 +386,6 @@
<div class="tag-row">
{# ### TAGS #}
{# colors to use: olive, brown, grey, pink, red, etc #}
- {# TODO: remove doc for ES 7.x-style lack of type #}
{# TODO: only show 'json' link if from cluster? #}
{% if debug_mode %}
<a target="_blank" rel="noopener" href="{{ settings.ELASTICSEARCH_PUBLIC_URL }}/{{ settings.ELASTICSEARCH_QUERY_FULLTEXT_INDEX }}/_doc/{{ paper.key }}">
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index f9616c4..3a7102a 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -483,7 +483,10 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
raise NotImplementedError(f"doc_type: {heavy.doc_type}")
# TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+
- if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq':
+ if (
+ heavy.grobid_fulltext
+ and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq"
+ ):
fulltext_release = [
r
for r in heavy.releases
@@ -603,6 +606,55 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
)
+def clean_ref_key(key: Optional[str], doi: Optional[str] = None) -> Optional[str]:
+ if not key:
+ return None
+ key = key.strip()
+ if key and doi and key.startswith(doi):
+ key = key.replace(doi + "-", "")
+ key = key.replace(doi, "")
+ if key.startswith("10.") and "SICI" in key and "-" in key:
+ subkey = key.split("-")[-1]
+ if subkey:
+ key = subkey
+ if key.startswith("10.") and "_" in key:
+ subkey = key.split("_")[-1]
+ if subkey:
+ key = subkey
+ if len(key) > 10 and "#" in key:
+ subkey = key.split("#")[-1]
+ if subkey:
+ key = subkey
+ if len(key) > 10 and "_" in key:
+ subkey = key.split("_")[-1]
+ if subkey:
+ key = subkey
+ if key and key.startswith("ref-"):
+ key = key[4:]
+ if len(key) >= 2 and key[0] in ["/", "_"]:
+ key = key[1:]
+ if not key:
+ return None
+ return key
+
+
+def test_clean_ref_key() -> None:
+ test_pairs = [
+ ("ref-23", None, "23"),
+ ("_bib0040", None, "bib0040"),
+ (" 20170224012016_R15", None, "R15"),
+ (
+ "10.1002/(SICI)1099-1026(199905/06)14:3<195::AID-FFJ807>3.0.CO;2-C-BIB1",
+ None,
+ "BIB1",
+ ),
+ ("BFnrcardio201557_CR175", None, "CR175"),
+ ("2019121710443552100_", None, "2019121710443552100_"),
+ ]
+ for raw, doi, expected in test_pairs:
+ assert clean_ref_key(raw, doi=doi) == expected
+
+
def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructured]:
output = []
for ref in tei_dict.get("citations") or []:
@@ -619,6 +671,10 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur
if a.get("name"):
assert isinstance(a["name"], str)
authors.append(a["name"])
+ ref_index = ref.get("index")
+ if ref_index is not None:
+ # transform from 0-indexed to 1-indexed
+ ref_index = ref_index + 1
output.append(
RefStructured(
biblio=RefBiblio(
@@ -636,15 +692,15 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur
pmid=ref.get("pmid"),
pmcid=clean_pmcid(ref.get("pmcid")),
arxiv_id=ref.get("arxiv_id"),
- # isbn13: Optional[str]
+ isbn=ref.get("isbn"),
url=clean_url_conservative(ref.get("url")),
),
release_ident=release.ident,
work_ident=release.work_id,
release_stage=release.release_stage,
release_year=release.release_year,
- index=ref.get("index"),
- key=ref.get("id"),
+ index=ref_index,
+ key=clean_ref_key(ref.get("id")),
locator=None,
# target_release_id
ref_source="grobid",
@@ -658,14 +714,6 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]:
for ref in release.refs:
ref_source = "fatcat"
- key = ref.key
- if key and release.ext_ids.doi and key.startswith(release.ext_ids.doi):
- key = key.replace(release.ext_ids.doi, "")
- if key and key.startswith("ref-"):
- key = key[4:]
- if key and key.startswith("b"):
- key = key[1:]
-
if release.extra and release.extra.get("pubmed"):
ref_source = "fatcat-pubmed"
elif release.extra and release.extra.get("crossref"):
@@ -676,6 +724,10 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]:
extra = ref.extra or dict()
authors = extra.get("authors") or []
authors = [a for a in authors if type(a) == str]
+ ref_index = None
+ if ref.index is not None:
+ # transform from 0-indexed (release.refs) to 1-indexed (fatcat_refs)
+ ref_index = ref.index + 1
output.append(
RefStructured(
biblio=RefBiblio(
@@ -689,18 +741,19 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]:
volume=extra.get("volume"),
issue=extra.get("issue"),
pages=extra.get("pages") or extra.get("page"),
- doi=extra.get("doi"),
+ doi=clean_doi(extra.get("doi")),
pmid=extra.get("pmid"),
- pmcid=extra.get("pmcid"),
+ pmcid=clean_pmcid(extra.get("pmcid")),
arxiv_id=extra.get("arxiv_id"),
- isbn13=extra.get("isbn13"),
+ isbn=extra.get("isbn13") or extra.get("isbn"),
url=clean_url_conservative(extra.get("url")),
),
release_ident=release.ident,
work_ident=release.work_id,
+ release_stage=release.release_stage,
release_year=release.release_year,
- index=ref.index,
- key=key or None,
+ index=ref_index,
+ key=clean_ref_key(ref.key, doi=release.ext_ids.doi),
locator=ref.locator,
target_release_id=ref.target_release_id,
ref_source=ref_source,
@@ -724,26 +777,41 @@ def refs_from_crossref(
authors = [
ref["author"],
]
- key = ref.get("key")
- if key and key.startswith(record["DOI"]):
- key = key.replace(record["DOI"] + "-", "")
- key = key.replace(record["DOI"], "")
- if key and key.startswith("ref-"):
- key = key[4:]
+ ref_title = ref.get("article-title")
ref_container_name = ref.get("journal-title")
if not ref_container_name:
+ ref_container_name = ref.get("container-title")
+
+ # volume-title is often a book title
+ if not ref_title:
+ ref_title = ref.get("volume-title")
+ elif not ref_container_name:
ref_container_name = ref.get("volume-title")
+
+ # series-title is a bit weird in Crossref references. it is often
+ # passed alone and seems to be the article/book title miscategorized.
+ # other times it is a conference name.
+ series_title = ref.get("series-title")
+ if not ref_title:
+ ref_title = series_title
+ elif not ref_container_name:
+ ref_container_name = series_title
+
+ year = ref.get("year")
+ if year:
+ year = clean_small_int(year)
+ else:
+ year = None
date = ref.get("date")
- year = None
- if date and len(date) >= 4 and date[:4].isdigit():
+ if date and not year and len(date) >= 4 and date[:4].isdigit():
year = int(date[:4])
- if year < 1000 or year > 2100:
- year = None
+ if year and (year < 1000 or year > 2100):
+ year = None
output.append(
RefStructured(
biblio=RefBiblio(
unstructured=ref.get("unstructured"),
- title=ref.get("article-title"),
+ title=ref_title,
subtitle=ref.get("subtitle"),
contrib_raw_names=authors,
year=year,
@@ -751,15 +819,18 @@ def refs_from_crossref(
publisher=ref.get("publisher"),
volume=ref.get("volume"),
issue=ref.get("issue"),
- pages=ref.get("page"),
- doi=ref.get("DOI"),
+ pages=ref.get("first-page"),
+ version=ref.get("edition"),
+ doi=clean_doi(ref.get("DOI")),
+ isbn=ref.get("ISBN"),
),
release_ident=release.ident,
work_ident=release.work_id,
+ release_stage=release.release_stage,
release_year=release.release_year,
- index=i,
- key=key or None,
- locator=ref.get("first-page"),
+ index=i + 1, # 1-indexed
+ key=clean_ref_key(ref.get("key"), doi=record.get("DOI")),
+ # locator,
target_release_id=None,
ref_source=ref_source,
)
@@ -795,7 +866,10 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]:
fulltext_refs: List[RefStructured] = []
# TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+
- if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq':
+ if (
+ heavy.grobid_fulltext
+ and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq"
+ ):
fulltext_release = [
r
for r in heavy.releases
diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py
index b5af18e..a705e20 100644
--- a/fatcat_scholar/web.py
+++ b/fatcat_scholar/web.py
@@ -20,6 +20,7 @@ from fastapi.responses import (
RedirectResponse,
)
from fastapi.middleware.cors import CORSMiddleware
+import fatcat_openapi_client
import sentry_sdk
from sentry_sdk.integrations.asgi import SentryAsgiMiddleware
from starlette_prometheus import metrics, PrometheusMiddleware
@@ -182,72 +183,6 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict
return doc
-@api.get(
- "/work/{work_ident}/access/wayback/{url:path}",
- operation_id="access_redirect_wayback",
- include_in_schema=False,
-)
-def access_redirect_wayback(
- url: str,
- request: Request,
- work_ident: str = Query(..., min_length=20, max_length=20),
-) -> Any:
- raw_original_url = "/".join(str(request.url).split("/")[7:])
- # the quote() call is necessary because the URL is un-encoded in the path parameter
- # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d
- original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",)
- doc_dict = get_es_scholar_doc(f"work_{work_ident}")
- if not doc_dict:
- raise HTTPException(status_code=404, detail="work not found")
- doc: ScholarDoc = doc_dict["_obj"]
- # combine fulltext with all access options
- access: List[Any] = []
- if doc.fulltext:
- access.append(doc.fulltext)
- access.extend(doc.access or [])
- for opt in access:
- if (
- opt.access_type == "wayback"
- and opt.access_url
- and "://web.archive.org/web/" in opt.access_url
- and opt.access_url.endswith(original_url)
- ):
- timestamp = opt.access_url.split("/")[4]
- if not (len(timestamp) == 14 and timestamp.isdigit()):
- continue
- access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
- return RedirectResponse(access_url, status_code=302)
- raise HTTPException(status_code=404, detail="access URL not found")
-
-
-@api.get(
- "/work/{work_ident}/access/ia_file/{item}/{file_path:path}",
- operation_id="access_redirect_ia_file",
- include_in_schema=False,
-)
-def access_redirect_ia_file(
- item: str,
- file_path: str,
- request: Request,
- work_ident: str = Query(..., min_length=20, max_length=20),
-) -> Any:
- original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:]))
- access_url = f"https://archive.org/download/{item}/{original_path}"
- doc_dict = get_es_scholar_doc(f"work_{work_ident}")
- if not doc_dict:
- raise HTTPException(status_code=404, detail="work not found")
- doc: ScholarDoc = doc_dict["_obj"]
- # combine fulltext with all access options
- access: List[Any] = []
- if doc.fulltext:
- access.append(doc.fulltext)
- access.extend(doc.access or [])
- for opt in access:
- if opt.access_type == "ia_file" and opt.access_url == access_url:
- return RedirectResponse(access_url, status_code=302)
- raise HTTPException(status_code=404, detail="access URL not found")
-
-
web = APIRouter()
@@ -413,6 +348,165 @@ def web_work(
)
+def access_redirect_fallback(
+ request: Request,
+ work_ident: str,
+ original_url: Optional[str] = None,
+ archiveorg_path: Optional[str] = None,
+) -> Any:
+ """
+ The purpose of this helper is to catch access redirects which would
+ otherwise return a 404, and "try harder" to find a redirect.
+ """
+ # lookup against the live fatcat API, instead of scholar ES index
+ api_conf = fatcat_openapi_client.Configuration()
+ api_conf.host = settings.FATCAT_API_HOST
+ api_client = fatcat_openapi_client.DefaultApi(
+ fatcat_openapi_client.ApiClient(api_conf)
+ )
+
+ # fetch list of releases for this work from current fatcat catalog. note
+ # that these releases are not expanded (don't include file entities)
+ try:
+ # fetch work entity itself to fail fast (true 404) and handle redirects
+ work_entity = api_client.get_work(work_ident)
+ logger.warning(
+ f"access_redirect_fallback: work_{work_ident} state={work_entity.state} redirect={work_entity.redirect}"
+ )
+ if work_entity.redirect:
+ work_ident = work_entity.redirect
+ partial_releases = api_client.get_work_releases(
+ ident=work_ident, hide="abstracts,references",
+ )
+ except fatcat_openapi_client.ApiException as ae:
+ raise HTTPException(
+ status_code=ae.status,
+ detail=f"Fatcat API call failed for work_{work_ident}",
+ )
+
+ # for each release, check for any archive.org access option with the given context
+ for partial in partial_releases:
+ release = api_client.get_release(
+ partial.ident,
+ expand="files",
+ # TODO: expand="files,filesets,webcaptures",
+ hide="abstracts,references",
+ )
+ if not release.files:
+ continue
+ for fe in release.files:
+ for url_pair in fe.urls:
+ access_url = url_pair.url
+ if (
+ original_url
+ and "://web.archive.org/web/" in access_url
+ and access_url.endswith(original_url)
+ ):
+ # TODO: test/verify this
+ timestamp = access_url.split("/")[4]
+ # if not (len(timestamp) == 14 and timestamp.isdigit()):
+ # continue
+ replay_url = (
+ f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+ )
+ return RedirectResponse(replay_url, status_code=302)
+ elif (
+ archiveorg_path
+ and "://archive.org/" in access_url
+ and archiveorg_path in access_url
+ ):
+ return RedirectResponse(access_url, status_code=302)
+
+ # give up and show an error page
+ lang = LangPrefix(request)
+ return i18n_templates[lang.code].TemplateResponse(
+ "access_404.html",
+ {
+ "request": request,
+ "locale": lang.code,
+ "lang_prefix": lang.prefix,
+ "work_ident": work_ident,
+ "original_url": original_url,
+ "archiveorg_path": archiveorg_path,
+ },
+ status_code=404,
+ )
+
+
+@web.get(
+ "/work/{work_ident}/access/wayback/{url:path}",
+ operation_id="access_redirect_wayback",
+ include_in_schema=False,
+)
+def access_redirect_wayback(
+ url: str,
+ request: Request,
+ work_ident: str = Query(..., min_length=20, max_length=20),
+) -> Any:
+ raw_original_url = "/".join(str(request.url).split("/")[7:])
+ # the quote() call is necessary because the URL is un-encoded in the path parameter
+ # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d
+ original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",)
+ doc_dict = get_es_scholar_doc(f"work_{work_ident}")
+ if not doc_dict:
+ return access_redirect_fallback(
+ request, work_ident=work_ident, original_url=original_url
+ )
+ doc: ScholarDoc = doc_dict["_obj"]
+ # combine fulltext with all access options
+ access: List[Any] = []
+ if doc.fulltext:
+ access.append(doc.fulltext)
+ access.extend(doc.access or [])
+ for opt in access:
+ if (
+ opt.access_type == "wayback"
+ and opt.access_url
+ and "://web.archive.org/web/" in opt.access_url
+ and opt.access_url.endswith(original_url)
+ ):
+ timestamp = opt.access_url.split("/")[4]
+ if not (len(timestamp) == 14 and timestamp.isdigit()):
+ continue
+ access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}"
+ return RedirectResponse(access_url, status_code=302)
+ return access_redirect_fallback(
+ request, work_ident=work_ident, original_url=original_url
+ )
+
+
+@web.get(
+ "/work/{work_ident}/access/ia_file/{item}/{file_path:path}",
+ operation_id="access_redirect_ia_file",
+ include_in_schema=False,
+)
+def access_redirect_ia_file(
+ item: str,
+ file_path: str,
+ request: Request,
+ work_ident: str = Query(..., min_length=20, max_length=20),
+) -> Any:
+ original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:]))
+ access_url = f"https://archive.org/download/{item}/{original_path}"
+ doc_dict = get_es_scholar_doc(f"work_{work_ident}")
+ if not doc_dict:
+ return access_redirect_fallback(
+ request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}"
+ )
+ doc: ScholarDoc = doc_dict["_obj"]
+ # combine fulltext with all access options
+ access: List[Any] = []
+ if doc.fulltext:
+ access.append(doc.fulltext)
+ access.extend(doc.access or [])
+ for opt in access:
+ if opt.access_type == "ia_file" and opt.access_url == access_url:
+ return RedirectResponse(access_url, status_code=302)
+ return access_redirect_fallback(
+ request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}"
+ )
+
+
app = FastAPI(
title="Fatcat Scholar",
description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.",
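
A small illustration of the quote() behaviour that the comments in
access_redirect_wayback refer to (a sketch, not part of the diff above):
Starlette hands the {url:path} parameter back un-encoded, so re-quoting with
that safe-character set restores percent-encoding for characters like spaces
while leaving scheme separators and query characters intact.

    import urllib.parse

    raw_original_url = "https://example.com/some paper.pdf?seq=1&v=2"
    original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;")
    print(original_url)
    # https://example.com/some%20paper.pdf?seq=1&v=2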
diff --git a/notes/scaling_works.md b/notes/scaling_works.md
index 3b004ef..60b4597 100644
--- a/notes/scaling_works.md
+++ b/notes/scaling_works.md
@@ -657,3 +657,66 @@ So added `--compress` and the `--tmpdir` (which needed to be created):
| esbulk -verbose -size 100 -id key -w 4 -index scholar_fulltext_v01 -type _doc \
2> /tmp/error.txt 1> /tmp/output.txt
+## 2021-06-06 Simple Iteration
+
+Some new paths, more parallelism, and more conservative file naming/handling,
+but otherwise not much changed from the 2020-12-30 run above.
+
+ export JOBDIR=/kubwa/scholar/2021-06-03
+ mkdir -p $JOBDIR
+ cd $JOBDIR
+ zcat /fast/release_export_expanded.json.gz | split --lines 8000000 - release_export_expanded.split_ -d --additional-suffix .json
+
+ cd /fast/fatcat-scholar
+ pipenv shell
+ export TMPDIR=/sandcrawler-db/tmp
+
+ # transform
+ set -u -o pipefail
+ for SHARD in {00..20}; do
+ cat $JOBDIR/release_export_expanded.split_$SHARD.json \
+ | parallel -j8 --line-buffer --compress --tmpdir $TMPDIR --round-robin --pipe python -m fatcat_scholar.work_pipeline run_releases \
+ | pv -l \
+ | pigz \
+ > $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz.WIP \
+ && mv $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz.WIP $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz
+ done
+
+ # dump refs
+ set -u -o pipefail
+ for SHARD in {00..20}; do
+ zcat $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz \
+ | pv -l \
+ | parallel -j8 --linebuffer --compress --tmpdir $TMPDIR --round-robin --pipe python -m fatcat_scholar.transform run_refs \
+ | pigz \
+ > $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.refs.json.gz.WIP \
+ && mv $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.refs.json.gz.WIP $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.refs.json.gz
+ done
+
+Ran into a problem with a single (!) bad TEI-XML document, due to bad text
+encoding:
+
+ xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 40, column 1122
+
+Root cause was an issue in GROBID, which seems to have been fixed in more
+recent GROBID versions. Patched to continue, and separately committed the patch
+to the fatcat-scholar code base.
+
+Ran several retries, manually.
+
+Upload to petabox:
+
+ export BASENAME=scholar_corpus_bundle_2021-06-03
+ for SHARD in {00..20}; do
+ ia upload ${BASENAME}_split-${SHARD} $JOBDIR/README.md $JOBDIR/fatcat_scholar_work_fulltext.split_${SHARD}.json.gz -m collection:"scholarly-tdm" --checksum
+ done
+
+ ia upload scholar_corpus_refs_2021-06-03 fatcat_scholar_work_fulltext.split_*.refs.json.gz -m collection:"scholarly-tdm" --checksum
+
+
+### Performance Notes (on 2021-06-06 run)
+
+Recently added crossref refs via sandcrawler-db postgrest lookup. Still seeing
+around 40 works per second with a single thread, similar to previous
+performance, so not a significant slowdown.
+
diff --git a/settings.toml b/settings.toml
index e2bc6d6..07ba1bd 100644
--- a/settings.toml
+++ b/settings.toml
@@ -5,6 +5,7 @@ SCHOLAR_ISSUEDB_PATH = "data/issue_db.sqlite"
I18N_LANG_DEFAULT = "en"
ELASTICSEARCH_QUERY_BASE = "http://localhost:9200"
ELASTICSEARCH_QUERY_FULLTEXT_INDEX = "scholar_fulltext"
+ELASTICSEARCH_QUERY_PREFERENCE = ""
ELASTICSEARCH_WRITE_BASE = "http://localhost:9200"
ELASTICSEARCH_WRITE_FULLTEXT_INDEX = "scholar_fulltext_v01"
ELASTICSEARCH_PUBLIC_URL = "http://localhost:9292"
@@ -50,14 +51,14 @@ KAFKA_BROKERS = ["localhost"]
[development-qa]
SCHOLAR_ENV = "dev"
-ELASTICSEARCH_QUERY_BASE = "http://scholar-svc500.fatcat.wiki:9292"
-ELASTICSEARCH_PUBLIC_URL = "http://scholar-svc500.fatcat.wiki:9292"
+ELASTICSEARCH_QUERY_BASE = "https://search.fatcat.wiki"
+ELASTICSEARCH_PUBLIC_URL = "https://search.fatcat.wiki"
[qa]
SCHOLAR_ENV = "qa"
-ELASTICSEARCH_QUERY_BASE = "http://scholar-svc500.fatcat.wiki:9292"
+ELASTICSEARCH_QUERY_BASE = "https://search.fatcat.wiki"
ELASTICSEARCH_WRITE_BASE = "http://localhost:9200"
-ELASTICSEARCH_PUBLIC_URL = "http://scholar-svc500.fatcat.wiki:9292"
+ELASTICSEARCH_PUBLIC_URL = "https://search.fatcat.wiki"
KAFKA_BROKERS = ["wbgrp-svc263.us.archive.org"]
[prod]
@@ -65,9 +66,10 @@ SCHOLAR_ENV = "prod"
ONION_DOMAIN = "scholar.archivev3qli37bju4rlh27glh24lljyezwxf4pokmrdbpefjlcrp5id.onion"
ELASTICSEARCH_QUERY_BASE = "http://localhost:9292"
ELASTICSEARCH_QUERY_FULLTEXT_INDEX = "scholar_fulltext"
+ELASTICSEARCH_QUERY_PREFERENCE = "_local"
ELASTICSEARCH_WRITE_BASE = "http://localhost:9200"
ELASTICSEARCH_WRITE_FULLTEXT_INDEX = "scholar_fulltext_v01_20210128"
-ELASTICSEARCH_PUBLIC_URL = "http://scholar-svc500.fatcat.wiki:9292"
+ELASTICSEARCH_PUBLIC_URL = "https://search.fatcat.wiki"
KAFKA_BROKERS = ["wbgrp-svc263.us.archive.org"]
ENABLE_GOATCOUNTER = true
GOATCOUNTER_ENDPOINT = "/goatcounter/count"
diff --git a/tests/files/example_crossref_record.json b/tests/files/example_crossref_record.json
new file mode 100644
index 0000000..d87c7c2
--- /dev/null
+++ b/tests/files/example_crossref_record.json
@@ -0,0 +1,225 @@
+{
+ "doi": "10.1515/jpm-2019-0016",
+ "record":
+{
+ "DOI": "10.1111/his.12200",
+ "ISSN": [
+ "0309-0167"
+ ],
+ "URL": "http://dx.doi.org/10.1111/his.12200",
+ "author": [
+ {
+ "affiliation": [],
+ "family": "Stewart",
+ "given": "Colin J R"
+ }
+ ],
+ "container-title": [
+ "Histopathology"
+ ],
+ "content-domain": {
+ "crossmark-restriction": false,
+ "domain": []
+ },
+ "created": {
+ "date-parts": [
+ [
+ 2013,
+ 6,
+ 3
+ ]
+ ],
+ "date-time": "2013-06-03T16:37:56Z",
+ "timestamp": 1370277476000
+ },
+ "deposited": {
+ "date-parts": [
+ [
+ 2017,
+ 6,
+ 21
+ ]
+ ],
+ "date-time": "2017-06-21T14:04:36Z",
+ "timestamp": 1498053876000
+ },
+ "indexed": {
+ "date-parts": [
+ [
+ 2020,
+ 7,
+ 28
+ ]
+ ],
+ "date-time": "2020-07-28T14:37:55Z",
+ "timestamp": 1595947075455
+ },
+ "is-referenced-by-count": 0,
+ "issn-type": [
+ {
+ "type": "print",
+ "value": "0309-0167"
+ }
+ ],
+ "issued": {
+ "date-parts": [
+ [
+ 2013,
+ 7
+ ]
+ ]
+ },
+ "license": [
+ {
+ "URL": "http://doi.wiley.com/10.1002/tdm_license_1.1",
+ "content-version": "tdm",
+ "delay-in-days": 792,
+ "start": {
+ "date-parts": [
+ [
+ 2015,
+ 9,
+ 1
+ ]
+ ],
+ "date-time": "2015-09-01T00:00:00Z",
+ "timestamp": 1441065600000
+ }
+ }
+ ],
+ "link": [
+ {
+ "URL": "https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1111%2Fhis.12200",
+ "content-type": "unspecified",
+ "content-version": "vor",
+ "intended-application": "text-mining"
+ }
+ ],
+ "member": "311",
+ "original-title": [],
+ "page": "n/a-n/a",
+ "prefix": "10.1111",
+ "published-online": {
+ "date-parts": [
+ [
+ 2013,
+ 7,
+ 16
+ ]
+ ]
+ },
+ "published-print": {
+ "date-parts": [
+ [
+ 2013,
+ 7
+ ]
+ ]
+ },
+ "publisher": "Wiley",
+ "reference": [
+ {
+ "DOI": "10.5858/arpa.2012-0112-RA",
+ "article-title": "The separation of benign and malignant mesothelial proliferations",
+ "author": "Churg",
+ "doi-asserted-by": "crossref",
+ "first-page": "1217",
+ "journal-title": "Arch. Pathol. Lab. Med.",
+ "key": "10.1111/his.12200-BIB0001|his12200-cit-0001",
+ "volume": "136",
+ "year": "2012"
+ },
+ {
+ "DOI": "10.1136/jcp.2010.086074",
+ "article-title": "Peritoneal mesothelial hyperplasia associated with gynaecological disease: a potential diagnostic pitfall that is commonly associated with endometriosis",
+ "author": "Opraka",
+ "doi-asserted-by": "crossref",
+ "first-page": "313",
+ "journal-title": "J. Clin. Pathol.",
+ "key": "10.1111/his.12200-BIB0002|his12200-cit-0002",
+ "volume": "64",
+ "year": "2011"
+ },
+ {
+ "DOI": "10.1038/modpathol.2012.105",
+ "article-title": "Deciduoid mesothelioma: report of 21 cases with review of the literature",
+ "author": "Ordonez",
+ "doi-asserted-by": "crossref",
+ "first-page": "1481",
+ "journal-title": "Mod. Pathol.",
+ "key": "10.1111/his.12200-BIB0003|his12200-cit-0003",
+ "volume": "25",
+ "year": "2012"
+ },
+ {
+ "DOI": "10.1111/j.1525-1438.2006.00509.x",
+ "article-title": "Atypical reactive ovarian surface epithelium, a pitfall in pathologic assessment",
+ "author": "Aydin",
+ "doi-asserted-by": "crossref",
+ "first-page": "207",
+ "issue": "Suppl. 1",
+ "journal-title": "Int. J. Gynecol. Cancer",
+ "key": "10.1111/his.12200-BIB0004|his12200-cit-0004",
+ "volume": "16",
+ "year": "2006"
+ },
+ {
+ "DOI": "10.1097/PAP.0b013e3180ca7d7b",
+ "article-title": "The pathology of endometriosis: a survey of the many faces of a common disease emphasizing diagnostic pitfalls and unusual and newly appreciated aspects",
+ "author": "Clement",
+ "doi-asserted-by": "crossref",
+ "first-page": "241",
+ "journal-title": "Adv. Anat. Pathol.",
+ "key": "10.1111/his.12200-BIB0005|his12200-cit-0005",
+ "volume": "14",
+ "year": "2007"
+ },
+ {
+ "article-title": "Extramedullary hematopoiesis associated with organizing peritoneal hemorrhage: a report of 5 cases in patients presenting with primary gynecological disorders",
+ "author": "Mesbah Ardakani",
+ "journal-title": "Int. J. Gynecol. Pathol.",
+ "key": "10.1111/his.12200-BIB0006|his12200-cit-0006"
+ },
+ {
+ "key": "10.1016/B0-12-227090-8/00204-9_bib5",
+ "series-title": "Advances in Laser Remote Sensing – Selected Papers Presented at the 20th International Laser Radar Conference",
+ "year": "2001"
+ },
+ {
+ "key": "CIT0041",
+ "unstructured": "Linda Weiss,Creating Capitalism. Oxford: Blackwell, 1988. 272 pp. £29.95. ISBN 0 631 15733 6."
+ },
+ {
+ "author": "L Piegl",
+ "edition": "2",
+ "key": "576_CR3",
+ "unstructured": "Piegl L, Tiller W (1997) The NURBS Book, Monographs in Visual Communication, 2nd edn. Springer, Berlin",
+ "volume-title": "The NURBS Book, Monographs in Visual Communication",
+ "year": "1997"
+ }
+ ],
+ "reference-count": 6,
+ "references-count": 6,
+ "relation": {
+ "cites": []
+ },
+ "score": null,
+ "short-container-title": [
+ "Histopathology"
+ ],
+ "short-title": [],
+ "source": "Crossref",
+ "subject": [
+ "Pathology and Forensic Medicine",
+ "Histology",
+ "General Medicine"
+ ],
+ "subtitle": [],
+ "title": [
+ "Deciduoid mesothelial hyperplasia of the pelvic peritoneum"
+ ],
+ "type": "journal-article"
+},
+ "release_ident": "arzkbn5brjf2nitdy4fkiusc4q"
+}
+
diff --git a/tests/test_refs_transform.py b/tests/test_refs_transform.py
index 3fa490b..078b73b 100644
--- a/tests/test_refs_transform.py
+++ b/tests/test_refs_transform.py
@@ -1,7 +1,8 @@
+import json
from fatcat_openapi_client import ReleaseEntity
from fatcat_scholar.grobid2json import teixml2json
-from fatcat_scholar.transform import refs_from_grobid
+from fatcat_scholar.transform import refs_from_grobid, refs_from_crossref
def test_transform_refs_grobid() -> None:
@@ -27,7 +28,7 @@ def test_transform_refs_grobid() -> None:
assert ref.release_year == 1234
assert ref.ref_source == "grobid"
assert ref.key == "b12"
- assert ref.index == 12
+ assert ref.index == 13
assert ref.locator == None
assert ref.biblio.contrib_raw_names is not None
assert ref.biblio.contrib_raw_names[0] == "K Tasa"
@@ -40,3 +41,58 @@ def test_transform_refs_grobid() -> None:
ref.biblio.unstructured
== "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19."
)
+
+
+def test_transform_refs_crossref() -> None:
+
+ with open("tests/files/example_crossref_record.json", "r") as f:
+ record = json.loads(f.read())
+
+ dummy_release = ReleaseEntity(
+ ident="releasedummy22222222222222",
+ work_id="workdummy22222222222222222",
+ release_year=1234,
+ release_stage="accepted",
+ ext_ids={},
+ )
+
+ refs = refs_from_crossref(dummy_release, record)
+
+ assert refs[0].release_ident == "releasedummy22222222222222"
+ assert refs[0].work_ident == "workdummy22222222222222222"
+ assert refs[0].release_stage == "accepted"
+ assert refs[0].release_year == 1234
+ assert refs[0].ref_source == "crossref"
+ assert refs[0].key == "BIB0001|his12200-cit-0001"
+ assert refs[0].index == 1
+ assert refs[0].locator is None
+ assert refs[0].biblio.contrib_raw_names is not None
+ assert refs[0].biblio.contrib_raw_names[0] == "Churg"
+ assert refs[0].biblio.container_name == "Arch. Pathol. Lab. Med."
+ assert (
+ refs[0].biblio.title
+ == "The separation of benign and malignant mesothelial proliferations"
+ )
+ assert refs[0].biblio.year == 2012
+ assert refs[0].biblio.pages == "1217"
+ assert refs[0].biblio.volume == "136"
+ assert refs[0].biblio.doi == "10.5858/arpa.2012-0112-ra"
+ assert refs[0].biblio.unstructured is None
+
+ assert (
+ refs[6].biblio.title
+ == "Advances in Laser Remote Sensing – Selected Papers Presented at the 20th International Laser Radar Conference"
+ )
+ assert refs[6].biblio.year == 2001
+
+ assert refs[7].key == "CIT0041"
+ assert (
+ refs[7].biblio.unstructured
+ == "Linda Weiss,Creating Capitalism. Oxford: Blackwell, 1988. 272 pp. £29.95. ISBN 0 631 15733 6."
+ )
+
+ assert refs[8].key == "576_CR3"
+ assert refs[8].biblio.unstructured is not None
+ assert refs[8].biblio.title == "The NURBS Book, Monographs in Visual Communication"
+ assert refs[8].biblio.year == 1997
+ assert refs[8].biblio.version == "2"
diff --git a/tests/test_web.py b/tests/test_web.py
index 7f1f72a..d9cfab6 100644
--- a/tests/test_web.py
+++ b/tests/test_web.py
@@ -3,6 +3,7 @@ from typing import Any
import pytest
from fastapi.testclient import TestClient
+import fatcat_openapi_client
from fatcat_scholar.web import app
@@ -148,7 +149,11 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
== "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf"
)
- # check that URL is validated
+ # check that URL is validated (force fatcat API fallback to fail)
+ fatcat_api_raw = mocker.patch("fatcat_openapi_client.ApiClient.call_api")
+ fatcat_api_raw.side_effect = [
+ fatcat_openapi_client.ApiException(status=404, reason="dummy")
+ ]
rv = client.get(
"/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY",
allow_redirects=False,
@@ -156,6 +161,102 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None:
assert rv.status_code == 404
+def test_access_redirect_fallback(client: Any, mocker: Any) -> None:
+
+ with open("tests/files/elastic_fulltext_get.json") as f:
+ elastic_resp = json.loads(f.read())
+
+ es_raw = mocker.patch(
+ "elasticsearch.connection.Urllib3HttpConnection.perform_request"
+ )
+ es_raw.side_effect = [
+ (200, {}, json.dumps(elastic_resp)),
+ (200, {}, json.dumps(elastic_resp)),
+ (200, {}, json.dumps(elastic_resp)),
+ (200, {}, json.dumps(elastic_resp)),
+ ]
+ fatcat_get_work_raw = mocker.patch("fatcat_openapi_client.DefaultApi.get_work")
+ fatcat_get_work_raw.side_effect = [
+ fatcat_openapi_client.WorkEntity(
+ state="active", ident="wwwwwwwwwwwwwwwwwwwwwwwwww",
+ )
+ ] * 4
+ fatcat_get_work_releases_raw = mocker.patch(
+ "fatcat_openapi_client.DefaultApi.get_work_releases"
+ )
+ fatcat_get_work_releases_raw.side_effect = [
+ [
+ fatcat_openapi_client.ReleaseEntity(
+ ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
+ ext_ids=fatcat_openapi_client.ReleaseExtIds(),
+ ),
+ ]
+ ] * 4
+ fatcat_get_release_raw = mocker.patch(
+ "fatcat_openapi_client.DefaultApi.get_release"
+ )
+ fatcat_get_release_raw.side_effect = [
+ fatcat_openapi_client.ReleaseEntity(
+ state="active",
+ ident="rrrrrrrrrrrrrrrrrrrrrrrrrr",
+ ext_ids=fatcat_openapi_client.ReleaseExtIds(),
+ files=[
+ fatcat_openapi_client.FileEntity(
+ ident="ffffffffffffffffffffffffff",
+ urls=[
+ fatcat_openapi_client.FileUrl(
+ rel="web", url="https://blarg.example.com",
+ ),
+ fatcat_openapi_client.FileUrl(
+ rel="webarchive",
+ url="https://web.archive.org/web/12345/https://example.com",
+ ),
+ fatcat_openapi_client.FileUrl(
+ rel="archive",
+ url="https://archive.org/download/some/thing.pdf",
+ ),
+ ],
+ ),
+ ],
+ )
+ ] * 4
+
+ # redirects should work after API lookup, for both wayback and archive.org
+ rv = client.get(
+ "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://example.com",
+ allow_redirects=False,
+ )
+ assert rv.status_code == 302
+ assert (
+ rv.headers["Location"]
+ == "https://web.archive.org/web/12345id_/https://example.com"
+ )
+
+ rv = client.get(
+ "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.pdf",
+ allow_redirects=False,
+ )
+ assert rv.status_code == 302
+ assert rv.headers["Location"] == "https://archive.org/download/some/thing.pdf"
+
+ # wrong URLs should still not work, but display a page with helpful links
+ rv = client.get(
+ "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY",
+ allow_redirects=False,
+ )
+ assert rv.status_code == 404
+ assert b"Access Location Not Found" in rv.content
+ assert b"web.archive.org/web/*/https://www.federalreserve.gov" in rv.content
+
+ rv = client.get(
+ "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.else.pdf",
+ allow_redirects=False,
+ )
+ assert rv.status_code == 404
+ assert b"Access Location Not Found" in rv.content
+ assert b"archive.org/download/some/thing.else.pdf" in rv.content
+
+
def test_access_redirect_encoding(client: Any, mocker: Any) -> None:
with open("tests/files/elastic_get_work_a6gvpil4brdgzhqyaog3ftngqe.json") as f: