From e72084a274145adc2dbcc8371bf6e4b26e129349 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 22 Nov 2021 13:53:48 -0800
Subject: add openalex directory source

Always run these as day-specific ("TODAY") commands.

Add timeouts so the commands actually complete in a reasonable amount of time.
---
 Makefile                        | 25 +++++----------
 chocula/directories/__init__.py |  2 ++
 chocula/directories/openalex.py | 68 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 18 deletions(-)
 create mode 100644 chocula/directories/openalex.py

diff --git a/Makefile b/Makefile
index 6d97255..477de20 100644
--- a/Makefile
+++ b/Makefile
@@ -29,26 +29,15 @@ test: lint ## Run all tests
 coverage: ## Run all tests with coverage
 	pipenv run pytest --cov

-data/container_stats.json:
-	mkdir -p data
-	cat data/container_export.json | jq .issnl -r | sort -u > /tmp/container_issnl.tsv
-	cat /tmp/container_issnl.tsv | parallel -j10 curl -s 'https://fatcat.wiki/container/issnl/{}/stats.json' | jq -c . > /tmp/container_stats.json
-	mv /tmp/container_stats.json data
-
 .PHONY: container-stats
-container-stats: data/container_stats.json
-	wc -l data/container_stats.json
+container-stats: data/$(TODAY)/container_stats.json ## Summarize fatcat container counts
+	wc -l data/$(TODAY)/container_stats.json
 	@echo
 	@echo Done

-data/homepage_status.json:
-	pipenv run ./chocula.py export_urls | shuf > /tmp/chocula_urls_to_crawl.tsv
-	pipenv run parallel -j10 --bar --pipepart -a /tmp/chocula_urls_to_crawl.shuf.tsv ./check_issn_urls.py > /tmp/homepage_status.json
-	cp /tmp/homepage_status.json data/
-
 .PHONY: homepage-status
-homepage-status: data/homepage_status.json
-	wc -l data/homepage-status.json
+homepage-status: data/$(TODAY)/homepage_status.json ## Check homepage "live"/"archive" existence for current database
+	wc -l data/$(TODAY)/homepage_status.json
 	@echo
 	@echo Done

@@ -111,9 +100,9 @@ data/$(TODAY)/homepage_status.json:
 	mv /tmp/url_status.json $@

 data/$(TODAY)/container_stats.json: data/container_export.json
-	cat data/container_export.json | jq .issnl -r | sort -u > /tmp/container_issnl.tsv
-	cat /tmp/container_issnl.tsv | parallel -j10 curl --fail -s 'https://fatcat.wiki/container/issnl/{}/stats.json' | jq -c . | pv -l > /tmp/container_stats.json
-	cp /tmp/container_stats.json $@
+	cat data/container_export.json | jq .ident -r | sort -u > /tmp/container_ident.tsv
+	cat /tmp/container_ident.tsv | parallel -j10 curl --max-time 30 --fail -s 'https://fatcat.wiki/container/{}/stats.json' | jq -c . | pv -l > /tmp/container_stats.json
+	mv /tmp/container_stats.json $@

 .PHONY: upload-sources
 upload-sources: update-sources ## Upload most recent update-able sources to a new IA item
diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py
index ed306c0..0bd13f8 100644
--- a/chocula/directories/__init__.py
+++ b/chocula/directories/__init__.py
@@ -19,6 +19,7 @@ from chocula.directories.issn_meta import IssnMetaLoader
 from chocula.directories.australian_era import AustralianEraLoader
 from chocula.directories.awol import AwolLoader
 from chocula.directories.mag import MagLoader
+from chocula.directories.openalex import OpenAlexLoader

 # sort order roughly results in metadata prioritization
 ALL_CHOCULA_DIR_CLASSES = [
@@ -43,4 +44,5 @@ ALL_CHOCULA_DIR_CLASSES = [
     SimLoader,
     ZdbFizeLoader,
     MagLoader,
+    OpenAlexLoader,
 ]
diff --git a/chocula/directories/openalex.py b/chocula/directories/openalex.py
new file mode 100644
index 0000000..478c814
--- /dev/null
+++ b/chocula/directories/openalex.py
@@ -0,0 +1,68 @@
+from typing import Iterable, Optional
+import csv
+
+from chocula.util import clean_str, clean_issn
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class OpenAlexLoader(DirectoryLoader):
+    """
+    TSV Columns (from schema docs):
+
+    1   JournalId           long    PRIMARY KEY
+    2   Rank                uint    (DEPRECATED)
+    3   NormalizedName      string
+    4   DisplayName         string
+    5   Issn                string  (ISSN-L)
+    6   Issns               JSON list
+    7   IsOa                bool
+    8   IsInDoaj            bool
+    9   Publisher           string
+    10  Webpage             string
+    11  PaperCount          long
+    12  PaperFamilyCount    long    (DEPRECATED)
+    13  CitationCount       long
+    14  CreatedDate         DateTime
+    15  UpdatedDate         DateTime
+
+    """
+
+    source_slug = "openalex"
+
+    def open_file(self) -> Iterable:
+        return csv.DictReader(
+            open(self.config.openalex.filepath, "r"),
+            delimiter="\t",
+            fieldnames=[
+                "JournalId",
+                "Rank",
+                "NormalizedName",
+                "DisplayName",
+                "Issn",
+                "Issns",
+                "IsOa",
+                "IsInDoaj",
+                "Publisher",
+                "Webpage",
+                "PaperCount",
+                "PaperFamilyCount",
+                "CitationCount",
+                "CreatedDate",
+                "UpdatedDate",
+            ],
+        )
+
+    def parse_record(self, record) -> Optional[DirectoryInfo]:
+        info = DirectoryInfo(
+            directory_slug=self.source_slug,
+            issnl=clean_issn(record["Issn"]),
+            custom_id=record["JournalId"],
+            name=clean_str(record["DisplayName"]),
+            publisher=clean_str(record["Publisher"]),
+        )
+        homepage = HomepageUrl.from_url(record["Webpage"] or "")
+        if homepage:
+            info.homepage_urls.append(homepage)
+
+        return info
--
cgit v1.2.3
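
For reference, below is a minimal standalone sketch of reading the TSV layout documented in the new loader's docstring, e.g. to spot-check an OpenAlex/MAG-style journals dump before running the import. It is not part of the patch: the file name "Journals.txt", the ad-hoc ISSN-L shape check, and the field cleanup are illustrative assumptions; the real loader delegates that work to clean_issn()/clean_str() and the DirectoryLoader machinery shown above.

# Standalone sketch (assumptions: a local "Journals.txt" dump, a naive ISSN-L check).
# Reads a tab-separated journals file with the same column order the patch's
# OpenAlexLoader declares, and yields a few cleaned-up fields per row.
import csv

FIELDNAMES = [
    "JournalId", "Rank", "NormalizedName", "DisplayName", "Issn", "Issns",
    "IsOa", "IsInDoaj", "Publisher", "Webpage", "PaperCount",
    "PaperFamilyCount", "CitationCount", "CreatedDate", "UpdatedDate",
]


def iter_journals(path):
    with open(path, "r") as f:
        for row in csv.DictReader(f, delimiter="\t", fieldnames=FIELDNAMES):
            issnl = (row.get("Issn") or "").strip()
            # Keep only rows with a plausible ISSN-L shape ("1234-567X");
            # the real loader relies on chocula.util.clean_issn() for this.
            if len(issnl) != 9 or issnl[4] != "-":
                continue
            yield {
                "issnl": issnl,
                "name": (row.get("DisplayName") or "").strip(),
                "publisher": (row.get("Publisher") or "").strip(),
                "homepage": (row.get("Webpage") or "").strip(),
            }


if __name__ == "__main__":
    # Example usage: print the first few parsed records from the dump.
    for i, rec in enumerate(iter_journals("Journals.txt")):
        print(rec)
        if i >= 4:
            break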