aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
-rw-r--r--extra/sql_dumps/README.md6
-rwxr-xr-xpython/fatcat_cleanup.py4
-rwxr-xr-xpython/fatcat_harvest.py5
-rwxr-xr-xpython/fatcat_import.py4
-rwxr-xr-xpython/fatcat_ingest.py6
-rwxr-xr-xpython/fatcat_review.py4
-rw-r--r--python/fatcat_tools/importers/datacite.py34
-rw-r--r--python/fatcat_tools/workers/changelog.py2
-rw-r--r--python/fatcat_web/__init__.py6
-rw-r--r--python/fatcat_web/web_config.py9
-rwxr-xr-xpython/fatcat_worker.py4
11 files changed, 54 insertions, 30 deletions
diff --git a/extra/sql_dumps/README.md b/extra/sql_dumps/README.md
index 7ce59754..2c73fd20 100644
--- a/extra/sql_dumps/README.md
+++ b/extra/sql_dumps/README.md
@@ -96,9 +96,13 @@ This dump will contain all tables in the backend schema, except for "private"
authentication tables. For local or non-production machines, might need to
replace the `fatcat_prod` database name.
+Note that prior to 2022, public dumps were in `--format=tar`, but this results
+in many temporary files being written to disk, which causes unnecessary load
+on the server.
+
# TODO: for production, probably want consistent serialization mode
export DATESLUG="`date +%Y-%m-%d.%H%M%S`"
- sudo -u postgres pg_dump --verbose --format=tar --exclude-table-data=auth_oidc fatcat_prod | pigz > /srv/fatcat/snapshots/fatcat_public_dbdump_${DATESLUG}.tar.gz
+ sudo -u postgres pg_dump --verbose --format=custom --exclude-table-data=auth_oidc fatcat_prod > /srv/fatcat/snapshots/fatcat_public_dbdump_${DATESLUG}.pgdump
Can also run using the remote/SSH options above.
diff --git a/python/fatcat_cleanup.py b/python/fatcat_cleanup.py
index b5e50425..b2bd4f0f 100755
--- a/python/fatcat_cleanup.py
+++ b/python/fatcat_cleanup.py
@@ -9,9 +9,6 @@ import sentry_sdk
from fatcat_tools import authenticated_api
from fatcat_tools.cleanups import FileCleaner, JsonLinePusher
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = sentry_sdk.init()
-
def run_files(args: argparse.Namespace) -> None:
fmi = FileCleaner(
@@ -71,6 +68,7 @@ def main() -> None:
# token is an optional kwarg (can be empty string, None, etc)
token=os.environ.get(args.auth_var),
)
+ sentry_sdk.init(environment=args.env)
args.func(args)
diff --git a/python/fatcat_harvest.py b/python/fatcat_harvest.py
index 4180dc16..8d86bca7 100755
--- a/python/fatcat_harvest.py
+++ b/python/fatcat_harvest.py
@@ -15,9 +15,6 @@ from fatcat_tools.harvest import (
PubmedFTPWorker,
)
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = sentry_sdk.init()
-
def run_crossref(args: argparse.Namespace) -> None:
worker = HarvestCrossrefWorker(
@@ -145,6 +142,8 @@ def main() -> None:
if not args.__dict__.get("func"):
print("tell me what to do!")
sys.exit(-1)
+
+ sentry_sdk.init(environment=args.env)
args.func(args)
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index 5c480fc5..f502d4ed 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -44,9 +44,6 @@ from fatcat_tools.importers import (
SqlitePusher,
)
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = sentry_sdk.init()
-
def run_crossref(args: argparse.Namespace) -> None:
fci = CrossrefImporter(
@@ -1022,6 +1019,7 @@ def main() -> None:
# token is an optional kwarg (can be empty string, None, etc)
token=os.environ.get(args.auth_var),
)
+ sentry_sdk.init()
args.func(args)
diff --git a/python/fatcat_ingest.py b/python/fatcat_ingest.py
index 3f8666ca..96fef8fe 100755
--- a/python/fatcat_ingest.py
+++ b/python/fatcat_ingest.py
@@ -17,16 +17,13 @@ from elasticsearch_dsl import Q, Search
from fatcat_tools import kafka_fail_fast, public_api, simple_kafka_producer
from fatcat_tools.transforms import release_ingest_request
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = sentry_sdk.init()
-
def _init_search(args: argparse.Namespace) -> Search:
# ensure API connection works
args.api.get_changelog()
- client = elasticsearch.Elasticsearch(args.elasticsearch_endpoint)
+ client = elasticsearch.Elasticsearch(args.elasticsearch_endpoint, timeout=120.0)
search = Search(using=client, index=args.elasticsearch_index)
return search
@@ -267,6 +264,7 @@ def main() -> None:
sys.exit(-1)
args.api = public_api(args.fatcat_api_url)
+ sentry_sdk.init(environment=args.env)
args.func(args)
diff --git a/python/fatcat_review.py b/python/fatcat_review.py
index 05a00681..599aeea3 100755
--- a/python/fatcat_review.py
+++ b/python/fatcat_review.py
@@ -8,9 +8,6 @@ import sentry_sdk
from fatcat_tools import authenticated_api
from fatcat_tools.reviewers import DummyReviewBot
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = sentry_sdk.init()
-
def run_dummy(args: argparse.Namespace) -> None:
reviewer = DummyReviewBot(args.api, poll_interval=args.poll_interval, verbose=args.verbose)
@@ -60,6 +57,7 @@ def main() -> None:
sys.exit(-1)
args.api = authenticated_api(args.fatcat_api_url)
+ sentry_sdk.init(environment=args.env)
args.func(args)
diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py
index b310f8bc..1d098aca 100644
--- a/python/fatcat_tools/importers/datacite.py
+++ b/python/fatcat_tools/importers/datacite.py
@@ -511,6 +511,8 @@ class DataciteImporter(EntityImporter):
):
relations.append(rel)
+ # TODO: could use many of these relations to do release/work grouping
+
if relations:
extra_datacite["relations"] = relations
@@ -646,6 +648,38 @@ class DataciteImporter(EntityImporter):
):
re.extra["container_name"] = "figshare.com"
+ # Columbia Institutional Repository includes full bibliographic
+ # metadata, which results in incorrect container_id matches. But this
+ # DOI prefix also publishes actual journals!
+ if (
+ re.ext_ids.doi.startswith("10.7916/")
+ and "-" in re.ext_ids.doi
+ and re.publisher == "Columbia University"
+ and re.extra
+ and re.extra.get("datacite")
+ ):
+ for relation in re.extra["datacite"].get("relations", []):
+ if relation.get("relationType") == "IsVariantFormOf":
+ re.container_id = None
+ if re.release_stage in ("published", None):
+ re.release_stage = "submitted"
+
+        # Several institutional and other repositories (including "RWTH" and
+        # "DESY") also result in incorrect container_id matches.
+        # This probably doesn't filter out enough, but is a start.
+        IR_DOI_PREFIXES = [
+            "10.15495/epub_ubt_",
+            "10.18154/rwth-20",
+            "10.3204/pubdb-",
+            "10.3204/phppubdb-",
+            "10.26204/kluedo/",
+        ]
+        # Check for datacite metadata separately from the loop: chaining the
+        # guard with `and` in the `for` header would iterate over the datacite
+        # dict (or raise TypeError on None) instead of over the prefixes.
+        if re.extra and re.extra.get("datacite"):
+            for prefix in IR_DOI_PREFIXES:
+                if re.ext_ids.doi.startswith(prefix):
+                    for relation in re.extra["datacite"].get("relations", []):
+                        if relation.get("relationType") == "IsVariantFormOf":
+                            re.container_id = None
+
return re
def try_update(self, re: ReleaseEntity) -> bool:
diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py
index 1af47d4b..436e0e00 100644
--- a/python/fatcat_tools/workers/changelog.py
+++ b/python/fatcat_tools/workers/changelog.py
@@ -131,6 +131,8 @@ class EntityUpdatesWorker(FatcatWorker):
"10.1088/",
# JSTOR: mostly blocks crawler
"10.2307/",
+ # arxiv: duplicates with arxiv identifiers (temporary)
+ "10.48550/",
]
self.live_pdf_ingest_doi_prefix_acceptlist = [
# biorxiv and medrxiv
diff --git a/python/fatcat_web/__init__.py b/python/fatcat_web/__init__.py
index 9877d98e..0725c2e2 100644
--- a/python/fatcat_web/__init__.py
+++ b/python/fatcat_web/__init__.py
@@ -45,7 +45,11 @@ login_manager.login_view = "/auth/login"
oauth = OAuth(app)
# Grabs sentry config from SENTRY_DSN environment variable
-sentry_sdk.init(integrations=[FlaskIntegration()])
+sentry_sdk.init(
+ integrations=[FlaskIntegration()],
+ release=Config.GIT_RELEASE,
+ environment=Config.FATCAT_DOMAIN,
+)
conf = fatcat_openapi_client.Configuration()
conf.host = Config.FATCAT_API_HOST
diff --git a/python/fatcat_web/web_config.py b/python/fatcat_web/web_config.py
index fb3b55ab..904189d6 100644
--- a/python/fatcat_web/web_config.py
+++ b/python/fatcat_web/web_config.py
@@ -157,12 +157,3 @@ class Config(object):
except Exception as e:
print("WARNING: couldn't set sentry git release automatically: " + str(e))
GIT_RELEASE = None
-
- SENTRY_CONFIG = {
- #'include_paths': ['fatcat_web', 'fatcat_openapi_client', 'fatcat_tools'],
- "enable-threads": True, # for uWSGI
- "release": GIT_RELEASE,
- "tags": {
- "fatcat_domain": FATCAT_DOMAIN,
- },
- }
diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py
index a7dcf755..70d172f7 100755
--- a/python/fatcat_worker.py
+++ b/python/fatcat_worker.py
@@ -15,9 +15,6 @@ from fatcat_tools.workers import (
EntityUpdatesWorker,
)
-# Yep, a global. Gets DSN from `SENTRY_DSN` environment variable
-sentry_client = sentry_sdk.init()
-
def run_changelog(args: argparse.Namespace) -> None:
topic = "fatcat-{}.changelog".format(args.env)
@@ -200,6 +197,7 @@ def main() -> None:
sys.exit(-1)
args.api = public_api(args.api_host_url)
+ sentry_sdk.init(environment=args.env)
args.func(args)