From 0071b77eb7fc20be4af1bbf9b6c0bfcb4e26816a Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 16 Apr 2020 01:17:45 +0200 Subject: changelog: extend release_types considered documents according to release_rev.release_type, we have 29 values: fatcat_prod=# select release_type, count(release_type) from release_rev group by release_type; release_type | count -------------------+----------- abstract | 2264 article | 6371076 article-journal | 101083841 article-newspaper | 17062 book | 1676941 chapter | 13914854 component | 58990 dataset | 6860325 editorial | 133573 entry | 1628487 graphic | 1809471 interview | 19898 legal_case | 3581 legislation | 1626 letter | 275119 paper-conference | 6074669 peer_review | 30581 post | 245807 post-weblog | 135 report | 1010699 retraction | 1292 review-book | 96219 software | 316 song | 24027 speech | 4263 standard | 312364 stub | 1036813 thesis | 414397 | 0 (29 rows) --- python/fatcat_tools/workers/changelog.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) (limited to 'python/fatcat_tools/workers') diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index d1e7c2db..fcf3f031 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -153,28 +153,37 @@ class EntityUpdatesWorker(FatcatWorker): doi = ingest_request.get('ext_ids', {}).get('doi') is_document = release.release_type in ( - 'article-journal', - 'paper-conference', + 'abstract', 'article', - 'report', + 'article-journal', + 'article-newspaper', + 'book', 'chapter', - 'manuscript', - 'review', - 'thesis', - 'letter', + 'component', 'editorial', - 'abstract', 'entry', + 'interview', + 'legal_case', + 'legislation', + 'letter', + 'manuscript', + 'paper-conference', 'patent', + 'peer_review', 'post', + 'post-weblog', + 'report', + 'retraction', + 'review', 'review-book', + 'thesis', ) is_not_pdf = release.release_type in ( 'dataset', - 'stub', - 'software', 'figure', 'graphic', + 'software', + 'stub', ) # accept list sets a default "crawl it" despite OA metadata for -- cgit v1.2.3 From e063d2f470951f1735ccca8c6ea4b37029a6fede Mon Sep 17 00:00:00 2001 From: Martin Czygan Date: Thu, 16 Apr 2020 20:54:20 +0200 Subject: changelog: limit types No partial docs (e.g. abstract), too generic components and entries, not HTML blogs. --- python/fatcat_tools/workers/changelog.py | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) (limited to 'python/fatcat_tools/workers') diff --git a/python/fatcat_tools/workers/changelog.py b/python/fatcat_tools/workers/changelog.py index fcf3f031..3a49f86e 100644 --- a/python/fatcat_tools/workers/changelog.py +++ b/python/fatcat_tools/workers/changelog.py @@ -153,15 +153,12 @@ class EntityUpdatesWorker(FatcatWorker): doi = ingest_request.get('ext_ids', {}).get('doi') is_document = release.release_type in ( - 'abstract', 'article', 'article-journal', 'article-newspaper', 'book', 'chapter', - 'component', 'editorial', - 'entry', 'interview', 'legal_case', 'legislation', @@ -170,8 +167,6 @@ class EntityUpdatesWorker(FatcatWorker): 'paper-conference', 'patent', 'peer_review', - 'post', - 'post-weblog', 'report', 'retraction', 'review', @@ -179,6 +174,7 @@ class EntityUpdatesWorker(FatcatWorker): 'thesis', ) is_not_pdf = release.release_type in ( + 'component', 'dataset', 'figure', 'graphic', -- cgit v1.2.3