diff options
-rw-r--r-- | extra/sql_dumps/README.md | 6 | ||||
-rwxr-xr-x | python/fatcat_cleanup.py | 4 | ||||
-rwxr-xr-x | python/fatcat_harvest.py | 5 | ||||
-rwxr-xr-x | python/fatcat_import.py | 4 | ||||
-rwxr-xr-x | python/fatcat_ingest.py | 6 | ||||
-rwxr-xr-x | python/fatcat_review.py | 4 | ||||
-rw-r--r-- | python/fatcat_tools/importers/datacite.py | 34 | ||||
-rw-r--r-- | python/fatcat_tools/workers/changelog.py | 2 | ||||
-rw-r--r-- | python/fatcat_web/__init__.py | 6 | ||||
-rw-r--r-- | python/fatcat_web/web_config.py | 9 | ||||
-rwxr-xr-x | python/fatcat_worker.py | 4 |
11 files changed, 54 insertions, 30 deletions
diff --git a/extra/sql_dumps/README.md b/extra/sql_dumps/README.md index 7ce59754..2c73fd20 100644 --- a/extra/sql_dumps/README.md +++ b/extra/sql_dumps/README.md @@ -96,9 +96,13 @@ This dump will contain all tables in the backend schema, except for "private" authentication tables. For local or non-production machines, might need to replace the `fatcat_prod` database name. +Note that prior to 2022, public dumps were in `--format=tar`, but this results +in many temporary files being written to disk, which causes unnecessary load +on the server. + # TODO: for production, probably want consistent serialization mode export DATESLUG="`date +%Y-%m-%d.%H%M%S`" - sudo -u postgres pg_dump --verbose --format=tar --exclude-table-data=auth_oidc fatcat_prod | pigz > /srv/fatcat/snapshots/fatcat_public_dbdump_${DATESLUG}.tar.gz + sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=auth_oidc fatcat_prod > /srv/fatcat/snapshots/fatcat_public_dbdump_${DATESLUG}.pgdump Can also run using the remote/SSH options above. diff --git a/python/fatcat_cleanup.py b/python/fatcat_cleanup.py index b5e50425..b2bd4f0f 100755 --- a/python/fatcat_cleanup.py +++ b/python/fatcat_cleanup.py @@ -9,9 +9,6 @@ import sentry_sdk from fatcat_tools import authenticated_api from fatcat_tools.cleanups import FileCleaner, JsonLinePusher -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = sentry_sdk.init() - def run_files(args: argparse.Namespace) -> None: fmi = FileCleaner( @@ -71,6 +68,7 @@ def main() -> None: # token is an optional kwarg (can be empty string, None, etc) token=os.environ.get(args.auth_var), ) + sentry_sdk.init(environment=args.env) args.func(args) diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py index 4180dc16..8d86bca7 100755 --- a/python/fatcat_harvest.py +++ b/python/fatcat_harvest.py @@ -15,9 +15,6 @@ from fatcat_tools.harvest import ( PubmedFTPWorker, ) -# Yep, a global. 
Gets DSN from `SENTRY_DSN` environment variable -sentry_client = sentry_sdk.init() - def run_crossref(args: argparse.Namespace) -> None: worker = HarvestCrossrefWorker( @@ -145,6 +142,8 @@ def main() -> None: if not args.__dict__.get("func"): print("tell me what to do!") sys.exit(-1) + + sentry_sdk.init(environment=args.env) args.func(args) diff --git a/python/fatcat_import.py b/python/fatcat_import.py index 5c480fc5..f502d4ed 100755 --- a/python/fatcat_import.py +++ b/python/fatcat_import.py @@ -44,9 +44,6 @@ from fatcat_tools.importers import ( SqlitePusher, ) -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = sentry_sdk.init() - def run_crossref(args: argparse.Namespace) -> None: fci = CrossrefImporter( @@ -1022,6 +1019,7 @@ def main() -> None: # token is an optional kwarg (can be empty string, None, etc) token=os.environ.get(args.auth_var), ) + sentry_sdk.init() args.func(args) diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py index 3f8666ca..96fef8fe 100755 --- a/python/fatcat_ingest.py +++ b/python/fatcat_ingest.py @@ -17,16 +17,13 @@ from elasticsearch_dsl import Q, Search from fatcat_tools import kafka_fail_fast, public_api, simple_kafka_producer from fatcat_tools.transforms import release_ingest_request -# Yep, a global. 
Gets DSN from `SENTRY_DSN` environment variable -sentry_client = sentry_sdk.init() - def _init_search(args: argparse.Namespace) -> Search: # ensure API connection works args.api.get_changelog() - client = elasticsearch.Elasticsearch(args.elasticsearch_endpoint) + client = elasticsearch.Elasticsearch(args.elasticsearch_endpoint, timeout=120.0) search = Search(using=client, index=args.elasticsearch_index) return search @@ -267,6 +264,7 @@ def main() -> None: sys.exit(-1) args.api = public_api(args.fatcat_api_url) + sentry_sdk.init(environment=args.env) args.func(args) diff --git a/python/fatcat_review.py b/python/fatcat_review.py index 05a00681..599aeea3 100755 --- a/python/fatcat_review.py +++ b/python/fatcat_review.py @@ -8,9 +8,6 @@ import sentry_sdk from fatcat_tools import authenticated_api from fatcat_tools.reviewers import DummyReviewBot -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = sentry_sdk.init() - def run_dummy(args: argparse.Namespace) -> None: reviewer = DummyReviewBot(args.api, poll_interval=args.poll_interval, verbose=args.verbose) @@ -60,6 +57,7 @@ def main() -> None: sys.exit(-1) args.api = authenticated_api(args.fatcat_api_url) + sentry_sdk.init(environment=args.env) args.func(args) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index b310f8bc..1d098aca 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -511,6 +511,8 @@ class DataciteImporter(EntityImporter): ): relations.append(rel) + # TODO: could use many of these relations to do release/work grouping + if relations: extra_datacite["relations"] = relations @@ -646,6 +648,38 @@ class DataciteImporter(EntityImporter): ): re.extra["container_name"] = "figshare.com" + # Columbia Institutional Repository includes full bibliographic + # metadata, which results in incorrect container_id matches. But this + # DOI prefix also publishes actual journals! 
+ if ( + re.ext_ids.doi.startswith("10.7916/") + and "-" in re.ext_ids.doi + and re.publisher == "Columbia University" + and re.extra + and re.extra.get("datacite") + ): + for relation in re.extra["datacite"].get("relations", []): + if relation.get("relationType") == "IsVariantFormOf": + re.container_id = None + if re.release_stage in ("published", None): + re.release_stage = "submitted" + + # several institutional and other repositories (including "RWTH" and + # "DESY") also results in incorrect container_id matches. + # This probably doesn't filter out enough, but is a start. + IR_DOI_PREFIXES = [ + "10.15495/epub_ubt_", + "10.18154/rwth-20", + "10.3204/pubdb-", + "10.3204/phppubdb-", + "10.26204/kluedo/", + ] + for prefix in IR_DOI_PREFIXES and re.extra and re.extra.get("datacite"): + if re.ext_ids.doi.startswith(prefix): + for relation in re.extra["datacite"].get("relations", []): + if relation.get("relationType") == "IsVariantFormOf": + re.container_id = None + return re def try_update(self, re: ReleaseEntity) -> bool: diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index 1af47d4b..436e0e00 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -131,6 +131,8 @@ class EntityUpdatesWorker(FatcatWorker): "10.1088/", # JSTOR: mostly blocks crawler "10.2307/", + # arxiv: duplicates with arxiv identifiers (temporary) + "10.48550/", ] self.live_pdf_ingest_doi_prefix_acceptlist = [ # biorxiv and medrxiv diff --git a/python/fatcat_web/__init__.py b/python/fatcat_web/__init__.py index 9877d98e..0725c2e2 100644 --- a/python/fatcat_web/__init__.py +++ b/python/fatcat_web/__init__.py @@ -45,7 +45,11 @@ login_manager.login_view = "/auth/login" oauth = OAuth(app) # Grabs sentry config from SENTRY_DSN environment variable -sentry_sdk.init(integrations=[FlaskIntegration()]) +sentry_sdk.init( + integrations=[FlaskIntegration()], + release=Config.GIT_RELEASE, + 
environment=Config.FATCAT_DOMAIN, +) conf = fatcat_openapi_client.Configuration() conf.host = Config.FATCAT_API_HOST diff --git a/python/fatcat_web/web_config.py b/python/fatcat_web/web_config.py index fb3b55ab..904189d6 100644 --- a/python/fatcat_web/web_config.py +++ b/python/fatcat_web/web_config.py @@ -157,12 +157,3 @@ class Config(object): except Exception as e: print("WARNING: couldn't set sentry git release automatically: " + str(e)) GIT_RELEASE = None - - SENTRY_CONFIG = { - #'include_paths': ['fatcat_web', 'fatcat_openapi_client', 'fatcat_tools'], - "enable-threads": True, # for uWSGI - "release": GIT_RELEASE, - "tags": { - "fatcat_domain": FATCAT_DOMAIN, - }, - } diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py index a7dcf755..70d172f7 100755 --- a/python/fatcat_worker.py +++ b/python/fatcat_worker.py @@ -15,9 +15,6 @@ from fatcat_tools.workers import ( EntityUpdatesWorker, ) -# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable -sentry_client = sentry_sdk.init() - def run_changelog(args: argparse.Namespace) -> None: topic = "fatcat-{}.changelog".format(args.env) @@ -200,6 +197,7 @@ def main() -> None: sys.exit(-1) args.api = public_api(args.api_host_url) + sentry_sdk.init(environment=args.env) args.func(args) |