-rw-r--r--  fatcat_scholar/identifiers.py               |   2
-rw-r--r--  fatcat_scholar/schema.py                    |  13
-rw-r--r--  fatcat_scholar/search.py                    |   3
-rw-r--r--  fatcat_scholar/templates/access_404.html    |  35
-rw-r--r--  fatcat_scholar/templates/search_macros.html |   3
-rw-r--r--  fatcat_scholar/transform.py                 | 142
-rw-r--r--  fatcat_scholar/web.py                       | 226
-rw-r--r--  notes/scaling_works.md                      |  63
-rw-r--r--  settings.toml                               |  12
-rw-r--r--  tests/files/example_crossref_record.json    | 225
-rw-r--r--  tests/test_refs_transform.py                |  60
-rw-r--r--  tests/test_web.py                           | 103
12 files changed, 772 insertions, 115 deletions
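The search.py and settings.toml hunks below thread a new `ELASTICSEARCH_QUERY_PREFERENCE` setting into fulltext queries. A minimal sketch of the pattern, using the same elasticsearch_dsl `Search.params()` call the diff itself adds (the helper name here is illustrative, not part of the change): the empty-string default leaves shard routing untouched, while `"_local"` in prod prefers shard copies on the node handling the request, so repeated queries are served by consistent shards.

    # sketch only; apply_query_preference is not part of this diff
    from elasticsearch_dsl import Search

    def apply_query_preference(search: Search, preference: str) -> Search:
        # empty-string default (settings.toml) leaves ES shard routing untouched
        if preference:
            # e.g. "_local" in prod: prefer shard copies on the local node
            search = search.params(preference=preference)
        return search
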
diff --git a/fatcat_scholar/identifiers.py b/fatcat_scholar/identifiers.py index 7572e20..9a64de8 100644 --- a/fatcat_scholar/identifiers.py +++ b/fatcat_scholar/identifiers.py @@ -27,7 +27,7 @@ def clean_doi(raw: Optional[str]) -> Optional[str]: if not "10." in raw: return None if not raw.startswith("10."): - raw = raw[raw.find("10."):] + raw = raw[raw.find("10.") :] if raw[7:9] == "//": raw = raw[:8] + raw[9:] diff --git a/fatcat_scholar/schema.py b/fatcat_scholar/schema.py index e6d0422..0fcf56e 100644 --- a/fatcat_scholar/schema.py +++ b/fatcat_scholar/schema.py @@ -270,11 +270,12 @@ class RefBiblio(BaseModel): volume: Optional[str] issue: Optional[str] pages: Optional[str] + version: Optional[str] doi: Optional[str] pmid: Optional[str] pmcid: Optional[str] arxiv_id: Optional[str] - isbn13: Optional[str] + isbn: Optional[str] url: Optional[str] @@ -284,7 +285,7 @@ class RefStructured(BaseModel): work_ident: Optional[str] release_stage: Optional[str] release_year: Optional[int] - index: Optional[int] + index: Optional[int] # 1-indexed key: Optional[str] locator: Optional[str] target_release_id: Optional[str] @@ -300,9 +301,12 @@ class RefTarget(BaseModel): def clean_small_int(raw: Optional[str]) -> Optional[int]: - if not raw or not raw.isdigit(): + if not raw or not raw.strip().isdigit(): + return None + try: + val = int(raw.strip()) + except ValueError: return None - val = int(raw) if abs(val) > 30000: return None return val @@ -317,6 +321,7 @@ def test_clean_small_int() -> None: assert clean_small_int("1200003") == None assert clean_small_int("-123") == None assert clean_small_int("48844") == None + assert clean_small_int("1990²") == None def doi_split_prefix(doi: str) -> str: diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py index 121cb69..dccaf07 100644 --- a/fatcat_scholar/search.py +++ b/fatcat_scholar/search.py @@ -377,6 +377,9 @@ def do_fulltext_search( search = search.params(track_total_hits=True) search = search[offset : (offset + limit)] + if settings.ELASTICSEARCH_QUERY_PREFERENCE: + search = search.params(preference=settings.ELASTICSEARCH_QUERY_PREFERENCE) + query_start = datetime.datetime.now() try: resp = search.execute() diff --git a/fatcat_scholar/templates/access_404.html b/fatcat_scholar/templates/access_404.html new file mode 100644 index 0000000..d058186 --- /dev/null +++ b/fatcat_scholar/templates/access_404.html @@ -0,0 +1,35 @@ +{% extends "base.html" %} + +{% block title %} +404 - {{ super() }} +{% endblock %} + +{% block main %} +<div class="ui icon error message"> + <div class="content"> + <div class="header">{% trans %}404: Access Location Not Found{% endtrans %}</div> + <p>{% trans %}We could not find a valid redirect for the URL you tried. Sorry about that!{% endtrans %} + <p>{% trans %}There may be a typo, truncation, or encoding error. 
Or, the resource may have been removed from our catalog.{% endtrans %} + <p>{% trans %}Some places you can visit try to hunt down this resource (or a replacement) include:{% endtrans %} + <ul> + {% if original_url %} + <li>{% trans %}Original web url:{% endtrans %} + <br> + <code style="word-break: break-all;"><a href="{{ original_url }}">{{ original_url }}</a></code> + </li> + <li><a href="https://web.archive.org/web/*/{{ original_url }}">{% trans %}Wayback Machine calendar page (all captures){% endtrans %}</a> + {% endif %} + {% if archiveorg_path %} + <li>{% trans %}archive.org download link for the item:{% endtrans %} + {% set archiveorg_url="https://archive.org/download" + archiveorg_path %} + <br> + <code style="word-break: break-all;"><a href="{{ archiveorg_url }}">{{ archiveorg_url }}</a></code> + {% endif %} + {% if work_ident %} + <li><a href="/work/{{ work_ident }}">{% trans %}Scholar landing page{% endtrans %}</a> + <li><a href="https://fatcat.wiki/work/{{ work_ident }}">{% trans %}Fatcat catalog page{% endtrans %}</a> + {% endif %} + </ul> + </div> +</div> +{% endblock %} diff --git a/fatcat_scholar/templates/search_macros.html b/fatcat_scholar/templates/search_macros.html index 4965045..ce50243 100644 --- a/fatcat_scholar/templates/search_macros.html +++ b/fatcat_scholar/templates/search_macros.html @@ -329,7 +329,7 @@ {% endif %} {% if paper.releases|length > 1 %} - {% for release in paper.releases if (release.ident != paper.biblio.release_ident and release.ident != paper.fulltext.release_ident) %} + {% for release in paper.releases if (release.ident != paper.biblio.release_ident and (not paper.fulltext or release.ident != paper.fulltext.release_ident)) %} {% if loop.first %} <h4 class="ui horizontal divider header"> {# <i class="tag icon"></i> #} @@ -386,7 +386,6 @@ <div class="tag-row"> {# ### TAGS #} {# colors to use: olive, brown, grey, pink, red, etc #} - {# TODO: remove doc for ES 7.x-style lack of type #} {# TODO: only show 'json' link if from cluster? 
#} {% if debug_mode %} <a target="_blank" rel="noopener" href="{{ settings.ELASTICSEARCH_PUBLIC_URL }}/{{ settings.ELASTICSEARCH_QUERY_FULLTEXT_INDEX }}/_doc/{{ paper.key }}"> diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py index f9616c4..3a7102a 100644 --- a/fatcat_scholar/transform.py +++ b/fatcat_scholar/transform.py @@ -483,7 +483,10 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: raise NotImplementedError(f"doc_type: {heavy.doc_type}") # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+ - if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq': + if ( + heavy.grobid_fulltext + and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq" + ): fulltext_release = [ r for r in heavy.releases @@ -603,6 +606,55 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]: ) +def clean_ref_key(key: Optional[str], doi: Optional[str] = None) -> Optional[str]: + if not key: + return None + key = key.strip() + if key and doi and key.startswith(doi): + key = key.replace(doi + "-", "") + key = key.replace(doi, "") + if key.startswith("10.") and "SICI" in key and "-" in key: + subkey = key.split("-")[-1] + if subkey: + key = subkey + if key.startswith("10.") and "_" in key: + subkey = key.split("_")[-1] + if subkey: + key = subkey + if len(key) > 10 and "#" in key: + subkey = key.split("#")[-1] + if subkey: + key = subkey + if len(key) > 10 and "_" in key: + subkey = key.split("_")[-1] + if subkey: + key = subkey + if key and key.startswith("ref-"): + key = key[4:] + if len(key) >= 2 and key[0] in ["/", "_"]: + key = key[1:] + if not key: + return None + return key + + +def test_clean_ref_key() -> None: + test_pairs = [ + ("ref-23", None, "23"), + ("_bib0040", None, "bib0040"), + (" 20170224012016_R15", None, "R15"), + ( + "10.1002/(SICI)1099-1026(199905/06)14:3<195::AID-FFJ807>3.0.CO;2-C-BIB1", + None, + "BIB1", + ), + ("BFnrcardio201557_CR175", None, "CR175"), + ("2019121710443552100_", None, "2019121710443552100_"), + ] + for raw, doi, expected in test_pairs: + assert clean_ref_key(raw, doi=doi) == expected + + def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructured]: output = [] for ref in tei_dict.get("citations") or []: @@ -619,6 +671,10 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur if a.get("name"): assert isinstance(a["name"], str) authors.append(a["name"]) + ref_index = ref.get("index") + if ref_index is not None: + # transform from 0-indexed to 1-indexed + ref_index = ref_index + 1 output.append( RefStructured( biblio=RefBiblio( @@ -636,15 +692,15 @@ def refs_from_grobid(release: ReleaseEntity, tei_dict: dict) -> List[RefStructur pmid=ref.get("pmid"), pmcid=clean_pmcid(ref.get("pmcid")), arxiv_id=ref.get("arxiv_id"), - # isbn13: Optional[str] + isbn=ref.get("isbn"), url=clean_url_conservative(ref.get("url")), ), release_ident=release.ident, work_ident=release.work_id, release_stage=release.release_stage, release_year=release.release_year, - index=ref.get("index"), - key=ref.get("id"), + index=ref_index, + key=clean_ref_key(ref.get("id")), locator=None, # target_release_id ref_source="grobid", @@ -658,14 +714,6 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: for ref in release.refs: ref_source = "fatcat" - key = ref.key - if key and release.ext_ids.doi and key.startswith(release.ext_ids.doi): - key = key.replace(release.ext_ids.doi, "") - if key 
and key.startswith("ref-"): - key = key[4:] - if key and key.startswith("b"): - key = key[1:] - if release.extra and release.extra.get("pubmed"): ref_source = "fatcat-pubmed" elif release.extra and release.extra.get("crossref"): @@ -676,6 +724,10 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: extra = ref.extra or dict() authors = extra.get("authors") or [] authors = [a for a in authors if type(a) == str] + ref_index = None + if ref.index is not None: + # transform from 0-indexed (release.refs) to 1-indexed (fatcat_refs) + ref_index = ref.index + 1 output.append( RefStructured( biblio=RefBiblio( @@ -689,18 +741,19 @@ def refs_from_release_refs(release: ReleaseEntity) -> List[RefStructured]: volume=extra.get("volume"), issue=extra.get("issue"), pages=extra.get("pages") or extra.get("page"), - doi=extra.get("doi"), + doi=clean_doi(extra.get("doi")), pmid=extra.get("pmid"), - pmcid=extra.get("pmcid"), + pmcid=clean_pmcid(extra.get("pmcid")), arxiv_id=extra.get("arxiv_id"), - isbn13=extra.get("isbn13"), + isbn=extra.get("isbn13") or extra.get("isbn"), url=clean_url_conservative(extra.get("url")), ), release_ident=release.ident, work_ident=release.work_id, + release_stage=release.release_stage, release_year=release.release_year, - index=ref.index, - key=key or None, + index=ref_index, + key=clean_ref_key(ref.key, doi=release.ext_ids.doi), locator=ref.locator, target_release_id=ref.target_release_id, ref_source=ref_source, @@ -724,26 +777,41 @@ def refs_from_crossref( authors = [ ref["author"], ] - key = ref.get("key") - if key and key.startswith(record["DOI"]): - key = key.replace(record["DOI"] + "-", "") - key = key.replace(record["DOI"], "") - if key and key.startswith("ref-"): - key = key[4:] + ref_title = ref.get("article-title") ref_container_name = ref.get("journal-title") if not ref_container_name: + ref_container_name = ref.get("container-title") + + # volume-title is often a book title + if not ref_title: + ref_title = ref.get("volume-title") + elif not ref_container_name: ref_container_name = ref.get("volume-title") + + # series-title is a bit weird in Crossref references. it is often + # passed alone and seems to be the article/book title miscategorized. + # other times it is a conference name. 
+ series_title = ref.get("series-title") + if not ref_title: + ref_title = series_title + elif not ref_container_name: + ref_container_name = series_title + + year = ref.get("year") + if year: + year = clean_small_int(year) + else: + year = None date = ref.get("date") - year = None - if date and len(date) >= 4 and date[:4].isdigit(): + if date and not year and len(date) >= 4 and date[:4].isdigit(): year = int(date[:4]) - if year < 1000 or year > 2100: - year = None + if year and (year < 1000 or year > 2100): + year = None output.append( RefStructured( biblio=RefBiblio( unstructured=ref.get("unstructured"), - title=ref.get("article-title"), + title=ref_title, subtitle=ref.get("subtitle"), contrib_raw_names=authors, year=year, @@ -751,15 +819,18 @@ def refs_from_crossref( publisher=ref.get("publisher"), volume=ref.get("volume"), issue=ref.get("issue"), - pages=ref.get("page"), - doi=ref.get("DOI"), + pages=ref.get("first-page"), + version=ref.get("edition"), + doi=clean_doi(ref.get("DOI")), + isbn=ref.get("ISBN"), ), release_ident=release.ident, work_ident=release.work_id, + release_stage=release.release_stage, release_year=release.release_year, - index=i, - key=key or None, - locator=ref.get("first-page"), + index=i + 1, # 1-indexed + key=clean_ref_key(ref.get("key"), doi=record.get("DOI")), + # locator, target_release_id=None, ref_source=ref_source, ) @@ -795,7 +866,10 @@ def refs_from_heavy(heavy: IntermediateBundle) -> Sequence[RefStructured]: fulltext_refs: List[RefStructured] = [] # TODO: this crude filter should not be necessary once we upgrade to GROBID v0.6+ - if heavy.grobid_fulltext and heavy.grobid_fulltext.get('file_ident') != 'gbbvrg2tpzan5hl3qcsfzh4vfq': + if ( + heavy.grobid_fulltext + and heavy.grobid_fulltext.get("file_ident") != "gbbvrg2tpzan5hl3qcsfzh4vfq" + ): fulltext_release = [ r for r in heavy.releases diff --git a/fatcat_scholar/web.py b/fatcat_scholar/web.py index b5af18e..a705e20 100644 --- a/fatcat_scholar/web.py +++ b/fatcat_scholar/web.py @@ -20,6 +20,7 @@ from fastapi.responses import ( RedirectResponse, ) from fastapi.middleware.cors import CORSMiddleware +import fatcat_openapi_client import sentry_sdk from sentry_sdk.integrations.asgi import SentryAsgiMiddleware from starlette_prometheus import metrics, PrometheusMiddleware @@ -182,72 +183,6 @@ def get_work(work_ident: str = Query(..., min_length=20, max_length=20)) -> dict return doc -@api.get( - "/work/{work_ident}/access/wayback/{url:path}", - operation_id="access_redirect_wayback", - include_in_schema=False, -) -def access_redirect_wayback( - url: str, - request: Request, - work_ident: str = Query(..., min_length=20, max_length=20), -) -> Any: - raw_original_url = "/".join(str(request.url).split("/")[7:]) - # the quote() call is necessary because the URL is un-encoded in the path parameter - # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d - original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",) - doc_dict = get_es_scholar_doc(f"work_{work_ident}") - if not doc_dict: - raise HTTPException(status_code=404, detail="work not found") - doc: ScholarDoc = doc_dict["_obj"] - # combine fulltext with all access options - access: List[Any] = [] - if doc.fulltext: - access.append(doc.fulltext) - access.extend(doc.access or []) - for opt in access: - if ( - opt.access_type == "wayback" - and opt.access_url - and "://web.archive.org/web/" in opt.access_url - and opt.access_url.endswith(original_url) - ): - timestamp = 
opt.access_url.split("/")[4] - if not (len(timestamp) == 14 and timestamp.isdigit()): - continue - access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}" - return RedirectResponse(access_url, status_code=302) - raise HTTPException(status_code=404, detail="access URL not found") - - -@api.get( - "/work/{work_ident}/access/ia_file/{item}/{file_path:path}", - operation_id="access_redirect_ia_file", - include_in_schema=False, -) -def access_redirect_ia_file( - item: str, - file_path: str, - request: Request, - work_ident: str = Query(..., min_length=20, max_length=20), -) -> Any: - original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:])) - access_url = f"https://archive.org/download/{item}/{original_path}" - doc_dict = get_es_scholar_doc(f"work_{work_ident}") - if not doc_dict: - raise HTTPException(status_code=404, detail="work not found") - doc: ScholarDoc = doc_dict["_obj"] - # combine fulltext with all access options - access: List[Any] = [] - if doc.fulltext: - access.append(doc.fulltext) - access.extend(doc.access or []) - for opt in access: - if opt.access_type == "ia_file" and opt.access_url == access_url: - return RedirectResponse(access_url, status_code=302) - raise HTTPException(status_code=404, detail="access URL not found") - - web = APIRouter() @@ -413,6 +348,165 @@ def web_work( ) +def access_redirect_fallback( + request: Request, + work_ident: str, + original_url: Optional[str] = None, + archiveorg_path: Optional[str] = None, +) -> Any: + """ + The purpose of this helper is to catch access redirects which would + otherwise return a 404, and "try harder" to find a redirect. + """ + # lookup against the live fatcat API, instead of scholar ES index + api_conf = fatcat_openapi_client.Configuration() + api_conf.host = settings.FATCAT_API_HOST + api_client = fatcat_openapi_client.DefaultApi( + fatcat_openapi_client.ApiClient(api_conf) + ) + + # fetch list of releases for this work from current fatcat catalog. 
note + # that these releases are not expanded (don't include file entities) + try: + # fetch work entity itself to fail fast (true 404) and handle redirects + work_entity = api_client.get_work(work_ident) + logger.warning( + f"access_redirect_fallback: work_{work_ident} state={work_entity.state} redirect={work_entity.redirect}" + ) + if work_entity.redirect: + work_ident = work_entity.redirect + partial_releases = api_client.get_work_releases( + ident=work_ident, hide="abstracts,references", + ) + except fatcat_openapi_client.ApiException as ae: + raise HTTPException( + status_code=ae.status, + detail=f"Fatcat API call failed for work_{work_ident}", + ) + + # for each release, check for any archive.org access option with the given context + for partial in partial_releases: + release = api_client.get_release( + partial.ident, + expand="files", + # TODO: expand="files,filesets,webcaptures", + hide="abstracts,references", + ) + if not release.files: + continue + for fe in release.files: + for url_pair in fe.urls: + access_url = url_pair.url + if ( + original_url + and "://web.archive.org/web/" in access_url + and access_url.endswith(original_url) + ): + # TODO: test/verify this + timestamp = access_url.split("/")[4] + # if not (len(timestamp) == 14 and timestamp.isdigit()): + # continue + replay_url = ( + f"https://web.archive.org/web/{timestamp}id_/{original_url}" + ) + return RedirectResponse(replay_url, status_code=302) + elif ( + archiveorg_path + and "://archive.org/" in access_url + and archiveorg_path in access_url + ): + return RedirectResponse(access_url, status_code=302) + + # give up and show an error page + lang = LangPrefix(request) + return i18n_templates[lang.code].TemplateResponse( + "access_404.html", + { + "request": request, + "locale": lang.code, + "lang_prefix": lang.prefix, + "work_ident": work_ident, + "original_url": original_url, + "archiveorg_path": archiveorg_path, + }, + status_code=404, + ) + + +@web.get( + "/work/{work_ident}/access/wayback/{url:path}", + operation_id="access_redirect_wayback", + include_in_schema=False, +) +def access_redirect_wayback( + url: str, + request: Request, + work_ident: str = Query(..., min_length=20, max_length=20), +) -> Any: + raw_original_url = "/".join(str(request.url).split("/")[7:]) + # the quote() call is necessary because the URL is un-encoded in the path parameter + # see also: https://github.com/encode/starlette/commit/f997938916d20e955478f60406ef9d293236a16d + original_url = urllib.parse.quote(raw_original_url, safe=":/%#?=@[]!$&'()*+,;",) + doc_dict = get_es_scholar_doc(f"work_{work_ident}") + if not doc_dict: + return access_redirect_fallback( + request, work_ident=work_ident, original_url=original_url + ) + doc: ScholarDoc = doc_dict["_obj"] + # combine fulltext with all access options + access: List[Any] = [] + if doc.fulltext: + access.append(doc.fulltext) + access.extend(doc.access or []) + for opt in access: + if ( + opt.access_type == "wayback" + and opt.access_url + and "://web.archive.org/web/" in opt.access_url + and opt.access_url.endswith(original_url) + ): + timestamp = opt.access_url.split("/")[4] + if not (len(timestamp) == 14 and timestamp.isdigit()): + continue + access_url = f"https://web.archive.org/web/{timestamp}id_/{original_url}" + return RedirectResponse(access_url, status_code=302) + return access_redirect_fallback( + request, work_ident=work_ident, original_url=original_url + ) + + +@web.get( + "/work/{work_ident}/access/ia_file/{item}/{file_path:path}", + operation_id="access_redirect_ia_file", 
+ include_in_schema=False, +) +def access_redirect_ia_file( + item: str, + file_path: str, + request: Request, + work_ident: str = Query(..., min_length=20, max_length=20), +) -> Any: + original_path = urllib.parse.quote("/".join(str(request.url).split("/")[8:])) + access_url = f"https://archive.org/download/{item}/{original_path}" + doc_dict = get_es_scholar_doc(f"work_{work_ident}") + if not doc_dict: + return access_redirect_fallback( + request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}" + ) + doc: ScholarDoc = doc_dict["_obj"] + # combine fulltext with all access options + access: List[Any] = [] + if doc.fulltext: + access.append(doc.fulltext) + access.extend(doc.access or []) + for opt in access: + if opt.access_type == "ia_file" and opt.access_url == access_url: + return RedirectResponse(access_url, status_code=302) + return access_redirect_fallback( + request, work_ident=work_ident, archiveorg_path=f"/{item}/{original_path}" + ) + + app = FastAPI( title="Fatcat Scholar", description="Fulltext search interface for scholarly web content in the Fatcat catalog. An Internet Archive project.", diff --git a/notes/scaling_works.md b/notes/scaling_works.md index 3b004ef..60b4597 100644 --- a/notes/scaling_works.md +++ b/notes/scaling_works.md @@ -657,3 +657,66 @@ So added `--compress` and the `--tmpdir` (which needed to be created): | esbulk -verbose -size 100 -id key -w 4 -index scholar_fulltext_v01 -type _doc \ 2> /tmp/error.txt 1> /tmp/output.txt +## 2021-06-06 Simple Iteration + +Some new paths, more parallelism, and more conservative file naming/handling, +but otherwise not much changed from the 2020-12-30 run above. + + export JOBDIR=/kubwa/scholar/2021-06-03 + mkdir -p $JOBDIR + cd $JOBDIR + zcat /fast/release_export_expanded.json.gz | split --lines 8000000 - release_export_expanded.split_ -d --additional-suffix .json + + cd /fast/fatcat-scholar + pipenv shell + export TMPDIR=/sandcrawler-db/tmp + + # transform + set -u -o pipefail + for SHARD in {00..20}; do + cat $JOBDIR/release_export_expanded.split_$SHARD.json \ + | parallel -j8 --line-buffer --compress --tmpdir $TMPDIR --round-robin --pipe python -m fatcat_scholar.work_pipeline run_releases \ + | pv -l \ + | pigz \ + > $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz.WIP \ + && mv $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz.WIP $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz + done + + # dump refs + set -u -o pipefail + for SHARD in {00..20}; do + zcat $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.json.gz \ + | pv -l \ + | parallel -j8 --linebuffer --compress --tmpdir $TMPDIR --round-robin --pipe python -m fatcat_scholar.transform run_refs \ + | pigz \ + > $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.refs.json.gz.WIP \ + && mv $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.refs.json.gz.WIP $JOBDIR/fatcat_scholar_work_fulltext.split_$SHARD.refs.json.gz + done + +Ran in to a problem with a single (!) bad TEI-XML document, due to bad text +encoding: + + xml.etree.ElementTree.ParseError: not well-formed (invalid token): line 40, column 1122 + +Root cause was an issue in GROBID, which seems to have been fixed in more +recent versions of GROBID. Patched to continue, and separately commited patch +to fatcat-scholar code base. + +Ran several retries, manually. 
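A minimal sketch of that kind of "patch to continue" guard, assuming the TEI string is parsed with the standard library (the separately committed patch may differ; `parse_grobid_tei` is a stand-in name):

    import xml.etree.ElementTree as ET
    from typing import Optional

    def parse_grobid_tei(tei_xml: str) -> Optional[ET.Element]:
        # skip a malformed TEI-XML document instead of aborting the whole batch
        try:
            return ET.fromstring(tei_xml)
        except ET.ParseError:
            return None
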
+ +Upload to petabox: + + export BASENAME=scholar_corpus_bundle_2021-06-03 + for SHARD in {00..20}; do + ia upload ${BASENAME}_split-${SHARD} $JOBDIR/README.md $JOBDIR/fatcat_scholar_work_fulltext.split_${SHARD}.json.gz -m collection:"scholarly-tdm" --checksum + done + + ia upload scholar_corpus_refs_2021-06-03 fatcat_scholar_work_fulltext.split_*.refs.json.gz -m collection:"scholarly-tdm" --checksum + + +### Performance Notes (on 2021-06-06 run) + +Recently added crossref refs via sandcrawler-db postgrest lookup. Seem to still +be getting around 40/sec works per second, with a single thread, similar to +previous performance, so not a significant slow down. + diff --git a/settings.toml b/settings.toml index e2bc6d6..07ba1bd 100644 --- a/settings.toml +++ b/settings.toml @@ -5,6 +5,7 @@ SCHOLAR_ISSUEDB_PATH = "data/issue_db.sqlite" I18N_LANG_DEFAULT = "en" ELASTICSEARCH_QUERY_BASE = "http://localhost:9200" ELASTICSEARCH_QUERY_FULLTEXT_INDEX = "scholar_fulltext" +ELASTICSEARCH_QUERY_PREFERENCE = "" ELASTICSEARCH_WRITE_BASE = "http://localhost:9200" ELASTICSEARCH_WRITE_FULLTEXT_INDEX = "scholar_fulltext_v01" ELASTICSEARCH_PUBLIC_URL = "http://localhost:9292" @@ -50,14 +51,14 @@ KAFKA_BROKERS = ["localhost"] [development-qa] SCHOLAR_ENV = "dev" -ELASTICSEARCH_QUERY_BASE = "http://scholar-svc500.fatcat.wiki:9292" -ELASTICSEARCH_PUBLIC_URL = "http://scholar-svc500.fatcat.wiki:9292" +ELASTICSEARCH_QUERY_BASE = "https://search.fatcat.wiki" +ELASTICSEARCH_PUBLIC_URL = "https://search.fatcat.wiki" [qa] SCHOLAR_ENV = "qa" -ELASTICSEARCH_QUERY_BASE = "http://scholar-svc500.fatcat.wiki:9292" +ELASTICSEARCH_QUERY_BASE = "https://search.fatcat.wiki" ELASTICSEARCH_WRITE_BASE = "http://localhost:9200" -ELASTICSEARCH_PUBLIC_URL = "http://scholar-svc500.fatcat.wiki:9292" +ELASTICSEARCH_PUBLIC_URL = "https://search.fatcat.wiki" KAFKA_BROKERS = ["wbgrp-svc263.us.archive.org"] [prod] @@ -65,9 +66,10 @@ SCHOLAR_ENV = "prod" ONION_DOMAIN = "scholar.archivev3qli37bju4rlh27glh24lljyezwxf4pokmrdbpefjlcrp5id.onion" ELASTICSEARCH_QUERY_BASE = "http://localhost:9292" ELASTICSEARCH_QUERY_FULLTEXT_INDEX = "scholar_fulltext" +ELASTICSEARCH_QUERY_PREFERENCE = "_local" ELASTICSEARCH_WRITE_BASE = "http://localhost:9200" ELASTICSEARCH_WRITE_FULLTEXT_INDEX = "scholar_fulltext_v01_20210128" -ELASTICSEARCH_PUBLIC_URL = "http://scholar-svc500.fatcat.wiki:9292" +ELASTICSEARCH_PUBLIC_URL = "https://search.fatcat.wiki" KAFKA_BROKERS = ["wbgrp-svc263.us.archive.org"] ENABLE_GOATCOUNTER = true GOATCOUNTER_ENDPOINT = "/goatcounter/count" diff --git a/tests/files/example_crossref_record.json b/tests/files/example_crossref_record.json new file mode 100644 index 0000000..d87c7c2 --- /dev/null +++ b/tests/files/example_crossref_record.json @@ -0,0 +1,225 @@ +{ + "doi": "10.1515/jpm-2019-0016", + "record": +{ + "DOI": "10.1111/his.12200", + "ISSN": [ + "0309-0167" + ], + "URL": "http://dx.doi.org/10.1111/his.12200", + "author": [ + { + "affiliation": [], + "family": "Stewart", + "given": "Colin J R" + } + ], + "container-title": [ + "Histopathology" + ], + "content-domain": { + "crossmark-restriction": false, + "domain": [] + }, + "created": { + "date-parts": [ + [ + 2013, + 6, + 3 + ] + ], + "date-time": "2013-06-03T16:37:56Z", + "timestamp": 1370277476000 + }, + "deposited": { + "date-parts": [ + [ + 2017, + 6, + 21 + ] + ], + "date-time": "2017-06-21T14:04:36Z", + "timestamp": 1498053876000 + }, + "indexed": { + "date-parts": [ + [ + 2020, + 7, + 28 + ] + ], + "date-time": "2020-07-28T14:37:55Z", + "timestamp": 1595947075455 + }, + 
"is-referenced-by-count": 0, + "issn-type": [ + { + "type": "print", + "value": "0309-0167" + } + ], + "issued": { + "date-parts": [ + [ + 2013, + 7 + ] + ] + }, + "license": [ + { + "URL": "http://doi.wiley.com/10.1002/tdm_license_1.1", + "content-version": "tdm", + "delay-in-days": 792, + "start": { + "date-parts": [ + [ + 2015, + 9, + 1 + ] + ], + "date-time": "2015-09-01T00:00:00Z", + "timestamp": 1441065600000 + } + } + ], + "link": [ + { + "URL": "https://api.wiley.com/onlinelibrary/tdm/v1/articles/10.1111%2Fhis.12200", + "content-type": "unspecified", + "content-version": "vor", + "intended-application": "text-mining" + } + ], + "member": "311", + "original-title": [], + "page": "n/a-n/a", + "prefix": "10.1111", + "published-online": { + "date-parts": [ + [ + 2013, + 7, + 16 + ] + ] + }, + "published-print": { + "date-parts": [ + [ + 2013, + 7 + ] + ] + }, + "publisher": "Wiley", + "reference": [ + { + "DOI": "10.5858/arpa.2012-0112-RA", + "article-title": "The separation of benign and malignant mesothelial proliferations", + "author": "Churg", + "doi-asserted-by": "crossref", + "first-page": "1217", + "journal-title": "Arch. Pathol. Lab. Med.", + "key": "10.1111/his.12200-BIB0001|his12200-cit-0001", + "volume": "136", + "year": "2012" + }, + { + "DOI": "10.1136/jcp.2010.086074", + "article-title": "Peritoneal mesothelial hyperplasia associated with gynaecological disease: a potential diagnostic pitfall that is commonly associated with endometriosis", + "author": "Opraka", + "doi-asserted-by": "crossref", + "first-page": "313", + "journal-title": "J. Clin. Pathol.", + "key": "10.1111/his.12200-BIB0002|his12200-cit-0002", + "volume": "64", + "year": "2011" + }, + { + "DOI": "10.1038/modpathol.2012.105", + "article-title": "Deciduoid mesothelioma: report of 21 cases with review of the literature", + "author": "Ordonez", + "doi-asserted-by": "crossref", + "first-page": "1481", + "journal-title": "Mod. Pathol.", + "key": "10.1111/his.12200-BIB0003|his12200-cit-0003", + "volume": "25", + "year": "2012" + }, + { + "DOI": "10.1111/j.1525-1438.2006.00509.x", + "article-title": "Atypical reactive ovarian surface epithelium, a pitfall in pathologic assessment", + "author": "Aydin", + "doi-asserted-by": "crossref", + "first-page": "207", + "issue": "Suppl. 1", + "journal-title": "Int. J. Gynecol. Cancer", + "key": "10.1111/his.12200-BIB0004|his12200-cit-0004", + "volume": "16", + "year": "2006" + }, + { + "DOI": "10.1097/PAP.0b013e3180ca7d7b", + "article-title": "The pathology of endometriosis: a survey of the many faces of a common disease emphasizing diagnostic pitfalls and unusual and newly appreciated aspects", + "author": "Clement", + "doi-asserted-by": "crossref", + "first-page": "241", + "journal-title": "Adv. Anat. Pathol.", + "key": "10.1111/his.12200-BIB0005|his12200-cit-0005", + "volume": "14", + "year": "2007" + }, + { + "article-title": "Extramedullary hematopoiesis associated with organizing peritoneal hemorrhage: a report of 5 cases in patients presenting with primary gynecological disorders", + "author": "Mesbah Ardakani", + "journal-title": "Int. J. Gynecol. Pathol.", + "key": "10.1111/his.12200-BIB0006|his12200-cit-0006" + }, + { + "key": "10.1016/B0-12-227090-8/00204-9_bib5", + "series-title": "Advances in Laser Remote Sensing – Selected Papers Presented at the 20th International Laser Radar Conference", + "year": "2001" + }, + { + "key": "CIT0041", + "unstructured": "Linda Weiss,Creating Capitalism. Oxford: Blackwell, 1988. 272 pp. £29.95. ISBN 0 631 15733 6." 
+ }, + { + "author": "L Piegl", + "edition": "2", + "key": "576_CR3", + "unstructured": "Piegl L, Tiller W (1997) The NURBS Book, Monographs in Visual Communication, 2nd edn. Springer, Berlin", + "volume-title": "The NURBS Book, Monographs in Visual Communication", + "year": "1997" + } + ], + "reference-count": 6, + "references-count": 6, + "relation": { + "cites": [] + }, + "score": null, + "short-container-title": [ + "Histopathology" + ], + "short-title": [], + "source": "Crossref", + "subject": [ + "Pathology and Forensic Medicine", + "Histology", + "General Medicine" + ], + "subtitle": [], + "title": [ + "Deciduoid mesothelial hyperplasia of the pelvic peritoneum" + ], + "type": "journal-article" +}, + "release_ident": "arzkbn5brjf2nitdy4fkiusc4q" +} + diff --git a/tests/test_refs_transform.py b/tests/test_refs_transform.py index 3fa490b..078b73b 100644 --- a/tests/test_refs_transform.py +++ b/tests/test_refs_transform.py @@ -1,7 +1,8 @@ +import json from fatcat_openapi_client import ReleaseEntity from fatcat_scholar.grobid2json import teixml2json -from fatcat_scholar.transform import refs_from_grobid +from fatcat_scholar.transform import refs_from_grobid, refs_from_crossref def test_transform_refs_grobid() -> None: @@ -27,7 +28,7 @@ def test_transform_refs_grobid() -> None: assert ref.release_year == 1234 assert ref.ref_source == "grobid" assert ref.key == "b12" - assert ref.index == 12 + assert ref.index == 13 assert ref.locator == None assert ref.biblio.contrib_raw_names is not None assert ref.biblio.contrib_raw_names[0] == "K Tasa" @@ -40,3 +41,58 @@ def test_transform_refs_grobid() -> None: ref.biblio.unstructured == "Tasa K, Baker R, Murray M. Using patient feedback for qua- lity improvement. Quality Management in Health Care 1996;8:206-19." ) + + +def test_transform_refs_crossref() -> None: + + with open("tests/files/example_crossref_record.json", "r") as f: + record = json.loads(f.read()) + + dummy_release = ReleaseEntity( + ident="releasedummy22222222222222", + work_id="workdummy22222222222222222", + release_year=1234, + release_stage="accepted", + ext_ids={}, + ) + + refs = refs_from_crossref(dummy_release, record) + + assert refs[0].release_ident == "releasedummy22222222222222" + assert refs[0].work_ident == "workdummy22222222222222222" + assert refs[0].release_stage == "accepted" + assert refs[0].release_year == 1234 + assert refs[0].ref_source == "crossref" + assert refs[0].key == "BIB0001|his12200-cit-0001" + assert refs[0].index == 1 + assert refs[0].locator is None + assert refs[0].biblio.contrib_raw_names is not None + assert refs[0].biblio.contrib_raw_names[0] == "Churg" + assert refs[0].biblio.container_name == "Arch. Pathol. Lab. Med." + assert ( + refs[0].biblio.title + == "The separation of benign and malignant mesothelial proliferations" + ) + assert refs[0].biblio.year == 2012 + assert refs[0].biblio.pages == "1217" + assert refs[0].biblio.volume == "136" + assert refs[0].biblio.doi == "10.5858/arpa.2012-0112-ra" + assert refs[0].biblio.unstructured is None + + assert ( + refs[6].biblio.title + == "Advances in Laser Remote Sensing – Selected Papers Presented at the 20th International Laser Radar Conference" + ) + assert refs[6].biblio.year == 2001 + + assert refs[7].key == "CIT0041" + assert ( + refs[7].biblio.unstructured + == "Linda Weiss,Creating Capitalism. Oxford: Blackwell, 1988. 272 pp. £29.95. ISBN 0 631 15733 6." 
+ ) + + assert refs[8].key == "576_CR3" + assert refs[8].biblio.unstructured is not None + assert refs[8].biblio.title == "The NURBS Book, Monographs in Visual Communication" + assert refs[8].biblio.year == 1997 + assert refs[8].biblio.version == "2" diff --git a/tests/test_web.py b/tests/test_web.py index 7f1f72a..d9cfab6 100644 --- a/tests/test_web.py +++ b/tests/test_web.py @@ -3,6 +3,7 @@ from typing import Any import pytest from fastapi.testclient import TestClient +import fatcat_openapi_client from fatcat_scholar.web import app @@ -148,7 +149,11 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None: == "https://web.archive.org/web/20200206164725id_/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf" ) - # check that URL is validated + # check that URL is validated (force fatcat API fallback to fail) + fatcat_api_raw = mocker.patch("fatcat_openapi_client.ApiClient.call_api") + fatcat_api_raw.side_effect = [ + fatcat_openapi_client.ApiException(status=404, reason="dummy") + ] rv = client.get( "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY", allow_redirects=False, @@ -156,6 +161,102 @@ def test_basic_access_redirect(client: Any, mocker: Any) -> None: assert rv.status_code == 404 +def test_access_redirect_fallback(client: Any, mocker: Any) -> None: + + with open("tests/files/elastic_fulltext_get.json") as f: + elastic_resp = json.loads(f.read()) + + es_raw = mocker.patch( + "elasticsearch.connection.Urllib3HttpConnection.perform_request" + ) + es_raw.side_effect = [ + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + (200, {}, json.dumps(elastic_resp)), + ] + fatcat_get_work_raw = mocker.patch("fatcat_openapi_client.DefaultApi.get_work") + fatcat_get_work_raw.side_effect = [ + fatcat_openapi_client.WorkEntity( + state="active", ident="wwwwwwwwwwwwwwwwwwwwwwwwww", + ) + ] * 4 + fatcat_get_work_releases_raw = mocker.patch( + "fatcat_openapi_client.DefaultApi.get_work_releases" + ) + fatcat_get_work_releases_raw.side_effect = [ + [ + fatcat_openapi_client.ReleaseEntity( + ident="rrrrrrrrrrrrrrrrrrrrrrrrrr", + ext_ids=fatcat_openapi_client.ReleaseExtIds(), + ), + ] + ] * 4 + fatcat_get_release_raw = mocker.patch( + "fatcat_openapi_client.DefaultApi.get_release" + ) + fatcat_get_release_raw.side_effect = [ + fatcat_openapi_client.ReleaseEntity( + state="active", + ident="rrrrrrrrrrrrrrrrrrrrrrrrrr", + ext_ids=fatcat_openapi_client.ReleaseExtIds(), + files=[ + fatcat_openapi_client.FileEntity( + ident="ffffffffffffffffffffffffff", + urls=[ + fatcat_openapi_client.FileUrl( + rel="web", url="https://blarg.example.com", + ), + fatcat_openapi_client.FileUrl( + rel="webarchive", + url="https://web.archive.org/web/12345/https://example.com", + ), + fatcat_openapi_client.FileUrl( + rel="archive", + url="https://archive.org/download/some/thing.pdf", + ), + ], + ), + ], + ) + ] * 4 + + # redirects should work after API lookup, for both wayback and archive.org + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://example.com", + allow_redirects=False, + ) + assert rv.status_code == 302 + assert ( + rv.headers["Location"] + == "https://web.archive.org/web/12345id_/https://example.com" + ) + + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.pdf", + allow_redirects=False, + ) + assert rv.status_code == 302 + assert rv.headers["Location"] == 
"https://archive.org/download/some/thing.pdf" + + # wrong URLs should still not work, but display a page with helpful links + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/wayback/https://www.federalreserve.gov/econresdata/feds/2015/files/2015118pap.pdf.DUMMY", + allow_redirects=False, + ) + assert rv.status_code == 404 + assert b"Access Location Not Found" in rv.content + assert b"web.archive.org/web/*/https://www.federalreserve.gov" in rv.content + + rv = client.get( + "/work/2x5qvct2dnhrbctqa2q2uyut6a/access/ia_file/some/thing.else.pdf", + allow_redirects=False, + ) + assert rv.status_code == 404 + assert b"Access Location Not Found" in rv.content + assert b"archive.org/download/some/thing.else.pdf" in rv.content + + def test_access_redirect_encoding(client: Any, mocker: Any) -> None: with open("tests/files/elastic_get_work_a6gvpil4brdgzhqyaog3ftngqe.json") as f: |