diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-03 18:14:44 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-03 18:14:44 -0700 |
commit | 48ee0ea51425d6050bb165632fa8bbf4a5b84a47 (patch) | |
tree | 4b4ef4f9e25eeaefe8bfb80d46426ffb76a8c0ac | |
parent | b66342bab2b58f94053066d9785fade037837b45 (diff) | |
download | fatcat-scholar-48ee0ea51425d6050bb165632fa8bbf4a5b84a47.tar.gz fatcat-scholar-48ee0ea51425d6050bb165632fa8bbf4a5b84a47.zip |
compute and use tags
-rw-r--r-- | fatcat_scholar/search.py | 3 | ||||
-rw-r--r-- | fatcat_scholar/transform.py | 41 |
2 files changed, 42 insertions, 2 deletions
```diff
diff --git a/fatcat_scholar/search.py b/fatcat_scholar/search.py
index 76a2f70..d29e03b 100644
--- a/fatcat_scholar/search.py
+++ b/fatcat_scholar/search.py
@@ -123,8 +123,7 @@ def do_fulltext_search(query: FulltextQuery, deep_page_limit: int = 2000) -> Ful
 
     # availability filters
     if query.filter_availability == "oa":
-        # TODO: real OA filter/flag
-        search = search.filter("term", tag="OA")
+        search = search.filter("term", tag="oa")
     elif query.filter_availability == "everything":
         pass
     elif query.filter_availability == "fulltext" or query.filter_availability == None:
diff --git a/fatcat_scholar/transform.py b/fatcat_scholar/transform.py
index ab63aa6..4538d70 100644
--- a/fatcat_scholar/transform.py
+++ b/fatcat_scholar/transform.py
@@ -243,6 +243,47 @@ def transform_heavy(heavy: IntermediateBundle) -> Optional[ScholarDoc]:
 
     # TODO: additional abstracts
 
+    # tags
+    if biblio.license_slug and biblio.license_slug.lower().startswith("cc-"):
+        tags.append('oa')
+    if primary_release and primary_release.container:
+        container = primary_release.container
+        if container.extra:
+            if container.extra.get('doaj'):
+                tags.append('doaj')
+                tags.append('oa')
+            if container.extra.get('road'):
+                tags.append('road')
+                tags.append('oa')
+            if container.extra('szczepanski'):
+                tags.append('szczepanski')
+                tags.append('oa')
+            if container.extra.get('ia', {}).get('longtail_oa'):
+                tags.append('longtail')
+                tags.append('oa')
+            if container.extra.get('sherpa_romeo', {}).get('color') == 'white':
+                tags.append('oa')
+            if container.extra.get('default_license', '').lower().startswith('cc-'):
+                tags.append('oa')
+            if container.extra.get('platform'):
+                # scielo, ojs, wordpress, etc
+                tags.append(container.extra['platform'].lower())
+    if biblio.doi_prefix == '10.2307':
+        tags.append('jstor')
+
+    # biorxiv/medrxiv hacks
+    if not biblio.container_name and biblio.release_stage != "published":
+        for _, acc in access_dict.items():
+            if "://www.medrxiv.org/" in acc.access_url:
+                biblio.container_name = 'medRxiv'
+                if biblio.release_stage == None:
+                    biblio.release_stage = "submitted"
+            elif "://www.biorxiv.org/" in acc.access_url:
+                biblio.container_name = 'bioRxiv'
+                if biblio.release_stage == None:
+                    biblio.release_stage = "submitted"
+    tags = list(set(tags))
+
     return ScholarDoc(
         key=key,
         doc_type=heavy.doc_type.value,
```