about | summary | refs | log | tree | commit | diff | stats
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org>, 2021-11-22 13:53:48 -0800
committer: Bryan Newbold <bnewbold@archive.org>, 2021-11-22 13:58:34 -0800
commit: e72084a274145adc2dbcc8371bf6e4b26e129349 (patch)
tree: 49a124fccf0089550d438c323e77937660e2806a
parent: 8d2f11fc0221a90732b3b13a3df1b16ea8acadeb (diff)
download: chocula-e72084a274145adc2dbcc8371bf6e4b26e129349.tar.gz
download: chocula-e72084a274145adc2dbcc8371bf6e4b26e129349.zip
add openalex directory source
Always run as day-specific ("TODAY") commands. Add timeouts so the command actually completes in a reasonable amount of time.
-rw-r--r-- Makefile 25
-rw-r--r-- chocula/directories/__init__.py 2
-rw-r--r-- chocula/directories/openalex.py 68
3 files changed, 77 insertions, 18 deletions
diff --git a/Makefile b/Makefile
index 6d97255..477de20 100644
--- a/Makefile
+++ b/Makefile
@@ -29,26 +29,15 @@ test: lint ## Run all tests
coverage: ## Run all tests with coverage
pipenv run pytest --cov
-data/container_stats.json:
- mkdir -p data
- cat data/container_export.json | jq .issnl -r | sort -u > /tmp/container_issnl.tsv
- cat /tmp/container_issnl.tsv | parallel -j10 curl -s 'https://fatcat.wiki/container/issnl/{}/stats.json' | jq -c . > /tmp/container_stats.json
- mv /tmp/container_stats.json data
-
.PHONY: container-stats
-container-stats: data/container_stats.json
- wc -l data/container_stats.json
+container-stats: data/$(TODAY)/container_stats.json ## Summarize fatcat container counts
+ wc -l data/$(TODAY)/container_stats.json
@echo
@echo Done
-data/homepage_status.json:
- pipenv run ./chocula.py export_urls | shuf > /tmp/chocula_urls_to_crawl.tsv
- pipenv run parallel -j10 --bar --pipepart -a /tmp/chocula_urls_to_crawl.shuf.tsv ./check_issn_urls.py > /tmp/homepage_status.json
- cp /tmp/homepage_status.json data/
-
.PHONY: homepage-status
-homepage-status: data/homepage_status.json
- wc -l data/homepage-status.json
+homepage-status: data/$(TODAY)/homepage_status.json ## Check homepage "live"/"archive" existence for current database
+ wc -l data/$(TODAY)/homepage_status.json
@echo
@echo Done
@@ -111,9 +100,9 @@ data/$(TODAY)/homepage_status.json:
mv /tmp/url_status.json $@
data/$(TODAY)/container_stats.json: data/container_export.json
- cat data/container_export.json | jq .issnl -r | sort -u > /tmp/container_issnl.tsv
- cat /tmp/container_issnl.tsv | parallel -j10 curl --fail -s 'https://fatcat.wiki/container/issnl/{}/stats.json' | jq -c . | pv -l > /tmp/container_stats.json
- cp /tmp/container_stats.json $@
+ cat data/container_export.json | jq .ident -r | sort -u > /tmp/container_ident.tsv
+ cat /tmp/container_ident.tsv | parallel -j10 curl --max-time 30 --fail -s 'https://fatcat.wiki/container/{}/stats.json' | jq -c . | pv -l > /tmp/container_stats.json
+ mv /tmp/container_stats.json $@
.PHONY: upload-sources
upload-sources: update-sources ## Upload most recent update-able sources to a new IA item
diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py
index ed306c0..0bd13f8 100644
--- a/chocula/directories/__init__.py
+++ b/chocula/directories/__init__.py
@@ -19,6 +19,7 @@ from chocula.directories.issn_meta import IssnMetaLoader
from chocula.directories.australian_era import AustralianEraLoader
from chocula.directories.awol import AwolLoader
from chocula.directories.mag import MagLoader
+from chocula.directories.openalex import OpenAlexLoader
# sort order roughly results in metadata prioritization
ALL_CHOCULA_DIR_CLASSES = [
@@ -43,4 +44,5 @@ ALL_CHOCULA_DIR_CLASSES = [
SimLoader,
ZdbFizeLoader,
MagLoader,
+ OpenAlexLoader,
]
diff --git a/chocula/directories/openalex.py b/chocula/directories/openalex.py
new file mode 100644
index 0000000..478c814
--- /dev/null
+++ b/chocula/directories/openalex.py
@@ -0,0 +1,68 @@
+from typing import Iterable, Optional
+import csv
+
+from chocula.util import clean_str, clean_issn
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class OpenAlexLoader(DirectoryLoader):
+ """
+ TSV Columns (from schema docs):
+
+ 1 JournalId long PRIMARY KEY
+ 2 Rank uint (DEPRECATED)
+ 3 NormalizedName string
+ 4 DisplayName string
+ 5 Issn string (ISSN-L)
+ 6 Issns JSON list
+ 7 IsOa bool
+ 8 IsInDoaj bool
+ 9 Publisher string
+ 10 Webpage string
+ 11 PaperCount long
+ 12 PaperFamilyCount long (DEPRECATED)
+ 13 CitationCount long
+ 14 CreatedDate DateTime
+ 15 UpdatedDate DateTime
+
+ """
+
+ source_slug = "openalex"
+
+ def open_file(self) -> Iterable:
+ return csv.DictReader(
+ open(self.config.openalex.filepath, "r"),
+ delimiter="\t",
+ fieldnames=[
+ "JournalId",
+ "Rank",
+ "NormalizedName",
+ "DisplayName",
+ "Issn",
+ "Issns",
+ "IsOa",
+ "IsInDoaj",
+ "Publisher",
+ "Webpage",
+ "PaperCount",
+ "PaperFamilyCount",
+ "CitationCount",
+ "CreatedDate",
+ "UpdatedDate",
+ ],
+ )
+
+ def parse_record(self, record) -> Optional[DirectoryInfo]:
+ info = DirectoryInfo(
+ directory_slug=self.source_slug,
+ issnl=clean_issn(record["Issn"]),
+ custom_id=record["JournalId"],
+ name=clean_str(record["DisplayName"]),
+ publisher=clean_str(record["Publisher"]),
+ )
+ homepage = HomepageUrl.from_url(record["Webpage"] or "")
+ if homepage:
+ info.homepage_urls.append(homepage)
+
+ return info