diff options
-rw-r--r-- | extra/stats/2020-02-21-prod-stats.json | 1 | ||||
-rw-r--r-- | extra/stats/2020-02-21-prod-tables-sizes.txt | 48 | ||||
-rw-r--r-- | extra/stats/2020-02-24-prod-dupes.txt | 5 | ||||
-rw-r--r-- | extra/stats/2020-02-24-prod-table-sizes.txt | 47 | ||||
-rw-r--r-- | extra/stats/2020-03-03-prod-stats.json | 1 | ||||
-rw-r--r-- | extra/stats/2020-04-17-prod-stats.json | 1 | ||||
-rw-r--r-- | extra/stats/2020-04-17-prod-table-sizes.txt | 46 | ||||
-rw-r--r-- | python/fatcat_tools/workers/__init__.py | 2 | ||||
-rw-r--r-- | python/fatcat_tools/workers/changelog.py | 29 | ||||
-rw-r--r-- | python/fatcat_tools/workers/elasticsearch.py | 23 | ||||
-rwxr-xr-x | python/fatcat_worker.py | 19 |
11 files changed, 207 insertions, 15 deletions
diff --git a/extra/stats/2020-02-21-prod-stats.json b/extra/stats/2020-02-21-prod-stats.json new file mode 100644 index 00000000..3ab6471f --- /dev/null +++ b/extra/stats/2020-02-21-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":3528195,"timestamp":"2020-02-22T05:23:18.082262+00:00"}},"container":{"total":148396},"papers":{"in_kbart":60529767,"in_web":20374670,"in_web_not_kbart":9598464,"is_oa":11547112,"total":105732384},"release":{"refs_total":890869519,"total":143867045}} diff --git a/extra/stats/2020-02-21-prod-tables-sizes.txt b/extra/stats/2020-02-21-prod-tables-sizes.txt new file mode 100644 index 00000000..bc756ba7 --- /dev/null +++ b/extra/stats/2020-02-21-prod-tables-sizes.txt @@ -0,0 +1,48 @@ + +Size: 478.37G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 53 GB | 43 GB | 96 GB + "public"."release_rev" | 58 GB | 33 GB | 91 GB + "public"."refs_blob" | 85 GB | 2884 MB | 88 GB + "public"."release_edit" | 14 GB | 20 GB | 34 GB + "public"."work_edit" | 13 GB | 20 GB | 34 GB + "public"."release_ident" | 9515 MB | 15 GB | 24 GB + "public"."work_ident" | 9313 MB | 15 GB | 24 GB + "public"."abstracts" | 16 GB | 1504 MB | 18 GB + "public"."file_rev_url" | 10235 MB | 3587 MB | 13 GB + "public"."work_rev" | 6046 MB | 5825 MB | 12 GB + "public"."release_ref" | 3997 MB | 5690 MB | 9686 MB + "public"."file_rev" | 3635 MB | 5359 MB | 8994 MB + "public"."file_edit" | 3111 MB | 4051 MB | 7162 MB + "public"."release_rev_abstract" | 2406 MB | 3342 MB | 5749 MB + "public"."file_ident" | 1848 MB | 2505 MB | 4354 MB + "public"."file_rev_release" | 1698 MB | 2483 MB | 4181 MB + "public"."creator_edit" | 702 MB | 942 MB | 1643 MB + "public"."creator_rev" | 695 MB | 719 MB | 1413 MB + "public"."editgroup" | 767 MB | 405 MB | 1172 MB + "public"."creator_ident" | 474 MB | 648 MB | 1121 MB + "public"."release_rev_extid" | 200 MB | 312 MB | 512 MB + "public"."changelog" | 220 MB | 214 MB | 434 MB + "public"."container_rev" | 75 MB | 23 MB | 98 MB + "public"."container_edit" | 25 MB | 31 MB | 56 MB + "public"."container_ident" | 11 MB | 19 MB | 30 MB + "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."auth_oidc" | 16 kB | 48 kB | 64 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."editor" | 16 kB | 48 kB | 64 kB + "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB + "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) + diff --git a/extra/stats/2020-02-24-prod-dupes.txt b/extra/stats/2020-02-24-prod-dupes.txt new file mode 100644 index 00000000..7d1d09cf --- /dev/null +++ b/extra/stats/2020-02-24-prod-dupes.txt @@ -0,0 +1,5 @@ + 19409 doi_ident.dupes.tsv + 28530 pmcid_ident.dupes.tsv + 463523 pmid_ident.dupes.tsv + 2025 sha1_ident.dupes.tsv + 10 wikidata_ident.dupes.tsv diff --git a/extra/stats/2020-02-24-prod-table-sizes.txt b/extra/stats/2020-02-24-prod-table-sizes.txt new file mode 100644 index 00000000..359cb2f3 --- /dev/null +++ b/extra/stats/2020-02-24-prod-table-sizes.txt @@ -0,0 +1,47 @@ + +Size: 560.76G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 53 GB | 43 GB | 96 GB + "public"."release_rev" | 58 GB | 33 GB | 91 GB + "public"."refs_blob" | 85 GB | 2884 MB | 88 GB + "public"."file_rev" | 23 GB | 26 GB | 49 GB + "public"."release_edit" | 14 GB | 20 GB | 34 GB + "public"."work_edit" | 13 GB | 20 GB | 34 GB + "public"."release_ident" | 9517 MB | 15 GB | 24 GB + "public"."work_ident" | 9315 MB | 15 GB | 24 GB + "public"."file_edit" | 9555 MB | 14 GB | 24 GB + "public"."abstracts" | 16 GB | 1505 MB | 18 GB + "public"."file_rev_url" | 13 GB | 4730 MB | 17 GB + "public"."file_ident" | 5885 MB | 9480 MB | 15 GB + "public"."file_rev_release" | 5515 MB | 9536 MB | 15 GB + "public"."work_rev" | 6047 MB | 5825 MB | 12 GB + "public"."release_ref" | 3997 MB | 5690 MB | 9686 MB + "public"."release_rev_abstract" | 2408 MB | 3343 MB | 5751 MB + "public"."creator_edit" | 702 MB | 942 MB | 1643 MB + "public"."creator_rev" | 695 MB | 719 MB | 1413 MB + "public"."editgroup" | 903 MB | 465 MB | 1368 MB + "public"."creator_ident" | 474 MB | 648 MB | 1121 MB + "public"."release_rev_extid" | 200 MB | 312 MB | 512 MB + "public"."changelog" | 261 MB | 229 MB | 490 MB + "public"."container_rev" | 75 MB | 23 MB | 98 MB + "public"."container_edit" | 25 MB | 31 MB | 56 MB + "public"."container_ident" | 11 MB | 19 MB | 30 MB + "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."auth_oidc" | 16 kB | 48 kB | 64 kB + "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB + "public"."editor" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) diff --git a/extra/stats/2020-03-03-prod-stats.json b/extra/stats/2020-03-03-prod-stats.json new file mode 100644 index 00000000..0ac977b8 --- /dev/null +++ b/extra/stats/2020-03-03-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":4242658,"timestamp":"2020-03-03T18:35:06.153130+00:00"}},"container":{"total":148428},"papers":{"in_kbart":60594053,"in_web":22232097,"in_web_not_kbart":10756782,"is_oa":15267353,"total":105933568},"release":{"refs_total":893136234,"total":144138471}} diff --git a/extra/stats/2020-04-17-prod-stats.json b/extra/stats/2020-04-17-prod-stats.json new file mode 100644 index 00000000..ddf7fca1 --- /dev/null +++ b/extra/stats/2020-04-17-prod-stats.json @@ -0,0 +1 @@ +{"changelog":{"latest":{"index":4460684,"timestamp":"2020-04-17T18:03:34.373631+00:00"}},"container":{"total":149527},"papers":{"in_kbart":60679890,"in_web":24250766,"in_web_not_kbart":11970984,"is_oa":15538739,"total":108761510},"release":{"refs_total":914708032,"total":148081134}} diff --git a/extra/stats/2020-04-17-prod-table-sizes.txt b/extra/stats/2020-04-17-prod-table-sizes.txt new file mode 100644 index 00000000..79aa3b98 --- /dev/null +++ b/extra/stats/2020-04-17-prod-table-sizes.txt @@ -0,0 +1,46 @@ +Size: 591.60G + + table_name | table_size | indexes_size | total_size +---------------------------------------+------------+--------------+------------ + "public"."release_contrib" | 55 GB | 45 GB | 100 GB + "public"."release_rev" | 60 GB | 34 GB | 94 GB + "public"."refs_blob" | 87 GB | 2885 MB | 89 GB + "public"."file_rev" | 26 GB | 29 GB | 55 GB + "public"."release_edit" | 14 GB | 21 GB | 35 GB + "public"."work_edit" | 14 GB | 21 GB | 34 GB + "public"."file_edit" | 11 GB | 16 GB | 27 GB + "public"."release_ident" | 9821 MB | 15 GB | 24 GB + "public"."work_ident" | 9596 MB | 15 GB | 24 GB + "public"."file_rev_url" | 15 GB | 6040 MB | 21 GB + "public"."abstracts" | 18 GB | 1688 MB | 19 GB + "public"."file_ident" | 6694 MB | 10219 MB | 17 GB + "public"."file_rev_release" | 6267 MB | 10109 MB | 16 GB + "public"."work_rev" | 6233 MB | 5825 MB | 12 GB + "public"."release_ref" | 4441 MB | 6322 MB | 11 GB + "public"."release_rev_abstract" | 2637 MB | 3505 MB | 6141 MB + "public"."creator_edit" | 702 MB | 942 MB | 1643 MB + "public"."editgroup" | 980 MB | 502 MB | 1482 MB + "public"."creator_rev" | 695 MB | 719 MB | 1413 MB + "public"."creator_ident" | 474 MB | 648 MB | 1121 MB + "public"."changelog" | 289 MB | 239 MB | 527 MB + "public"."release_rev_extid" | 206 MB | 320 MB | 526 MB + "public"."container_rev" | 75 MB | 23 MB | 98 MB + "public"."container_edit" | 25 MB | 32 MB | 57 MB + "public"."container_ident" | 11 MB | 19 MB | 30 MB + "public"."webcapture_rev_cdx" | 64 kB | 32 kB | 96 kB + "public"."fileset_rev_file" | 48 kB | 32 kB | 80 kB + "public"."auth_oidc" | 16 kB | 48 kB | 64 kB + "public"."editgroup_annotation" | 16 kB | 48 kB | 64 kB + "public"."fileset_edit" | 16 kB | 48 kB | 64 kB + "public"."webcapture_edit" | 16 kB | 48 kB | 64 kB + "public"."editor" | 16 kB | 48 kB | 64 kB + "public"."fileset_rev_url" | 16 kB | 32 kB | 48 kB + "public"."webcapture_rev_url" | 16 kB | 32 kB | 48 kB + "public"."fileset_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_ident" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_rev_release" | 8192 bytes | 32 kB | 40 kB + "public"."webcapture_ident" | 8192 bytes | 32 kB | 40 kB + "public"."fileset_rev" | 16 kB | 16 kB | 32 kB + "public"."webcapture_rev" | 16 kB | 16 kB | 32 kB + "public"."__diesel_schema_migrations" | 8192 bytes | 16 kB | 24 kB +(41 rows) diff --git a/python/fatcat_tools/workers/__init__.py b/python/fatcat_tools/workers/__init__.py index 8bea7cdc..32fd330d 100644 --- a/python/fatcat_tools/workers/__init__.py +++ b/python/fatcat_tools/workers/__init__.py @@ -1,4 +1,4 @@ from .changelog import ChangelogWorker, EntityUpdatesWorker -from .elasticsearch import ElasticsearchReleaseWorker, ElasticsearchContainerWorker +from .elasticsearch import ElasticsearchReleaseWorker, ElasticsearchContainerWorker, ElasticsearchChangelogWorker from .worker_common import most_recent_message, FatcatWorker diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index d1e7c2db..3a49f86e 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -153,28 +153,33 @@ class EntityUpdatesWorker(FatcatWorker): doi = ingest_request.get('ext_ids', {}).get('doi') is_document = release.release_type in ( - 'article-journal', - 'paper-conference', 'article', - 'report', + 'article-journal', + 'article-newspaper', + 'book', 'chapter', - 'manuscript', - 'review', - 'thesis', - 'letter', 'editorial', - 'abstract', - 'entry', + 'interview', + 'legal_case', + 'legislation', + 'letter', + 'manuscript', + 'paper-conference', 'patent', - 'post', + 'peer_review', + 'report', + 'retraction', + 'review', 'review-book', + 'thesis', ) is_not_pdf = release.release_type in ( + 'component', 'dataset', - 'stub', - 'software', 'figure', 'graphic', + 'software', + 'stub', ) # accept list sets a default "crawl it" despite OA metadata for diff --git a/python/fatcat_tools/workers/elasticsearch.py b/python/fatcat_tools/workers/elasticsearch.py index 68d6c304..525f372b 100644 --- a/python/fatcat_tools/workers/elasticsearch.py +++ b/python/fatcat_tools/workers/elasticsearch.py @@ -4,7 +4,7 @@ import time import requests from confluent_kafka import Consumer, KafkaException -from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ApiClient +from fatcat_openapi_client import ReleaseEntity, ContainerEntity, ApiClient, ChangelogEntry from fatcat_tools import * from .worker_common import FatcatWorker @@ -148,3 +148,24 @@ class ElasticsearchContainerWorker(ElasticsearchReleaseWorker): self.elasticsearch_document_name = "container" self.transform_func = container_to_elasticsearch + +class ElasticsearchChangelogWorker(ElasticsearchReleaseWorker): + """ + Pulls changelog messages from Kafka, runs transformations and indexes them. + + Note: Very early versions of changelog entries did not contain details + about the editor or extra fields. + """ + def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, + elasticsearch_backend="http://localhost:9200", elasticsearch_index="fatcat_changelog", + batch_size=200): + super().__init__(kafka_hosts=kafka_hosts, + consume_topic=consume_topic) + self.consumer_group = "elasticsearch-updates3" + self.batch_size = batch_size + self.poll_interval = poll_interval + self.elasticsearch_backend = elasticsearch_backend + self.elasticsearch_index = elasticsearch_index + self.entity_type = ChangelogEntry + self.elasticsearch_document_name = "changelog" + self.transform_func = changelog_to_elasticsearch diff --git a/python/fatcat_worker.py b/python/fatcat_worker.py index bfb87a72..03167a3a 100755 --- a/python/fatcat_worker.py +++ b/python/fatcat_worker.py @@ -6,7 +6,7 @@ import datetime import raven from fatcat_tools import public_api -from fatcat_tools.workers import ChangelogWorker, EntityUpdatesWorker, ElasticsearchReleaseWorker, ElasticsearchContainerWorker +from fatcat_tools.workers import ChangelogWorker, EntityUpdatesWorker, ElasticsearchReleaseWorker, ElasticsearchContainerWorker, ElasticsearchChangelogWorker # Yep, a global. Gets DSN from `SENTRY_DSN` environment variable sentry_client = raven.Client() @@ -47,6 +47,13 @@ def run_elasticsearch_container(args): elasticsearch_index=args.elasticsearch_index) worker.run() +def run_elasticsearch_changelog(args): + consume_topic = "fatcat-{}.changelog".format(args.env) + worker = ElasticsearchChangelogWorker(args.kafka_hosts, consume_topic, + elasticsearch_backend=args.elasticsearch_backend, + elasticsearch_index=args.elasticsearch_index) + worker.run() + def main(): parser = argparse.ArgumentParser( formatter_class=argparse.ArgumentDefaultsHelpFormatter) @@ -92,6 +99,16 @@ def main(): help="elasticsearch index to push into", default="fatcat_container") + sub_elasticsearch_changelog = subparsers.add_parser('elasticsearch-changelog', + help="consume changelog kafka feed, transform and push to search") + sub_elasticsearch_changelog.set_defaults(func=run_elasticsearch_changelog) + sub_elasticsearch_changelog.add_argument('--elasticsearch-backend', + help="elasticsearch backend to connect to", + default="http://localhost:9200") + sub_elasticsearch_changelog.add_argument('--elasticsearch-index', + help="elasticsearch index to push into", + default="fatcat_changelog") + args = parser.parse_args() if not args.__dict__.get("func"): print("tell me what to do!") |