From e72084a274145adc2dbcc8371bf6e4b26e129349 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Mon, 22 Nov 2021 13:53:48 -0800
Subject: add openalex directory source

Always run these as day-specific ("TODAY") commands.

Add timeouts so the commands actually complete in a reasonable amount of time.
---
 Makefile                        | 25 +++++----------
 chocula/directories/__init__.py |  2 ++
 chocula/directories/openalex.py | 68 +++++++++++++++++++++++++++++++++++++++++
 3 files changed, 77 insertions(+), 18 deletions(-)
 create mode 100644 chocula/directories/openalex.py

diff --git a/Makefile b/Makefile
index 6d97255..477de20 100644
--- a/Makefile
+++ b/Makefile
@@ -29,26 +29,15 @@ test: lint ## Run all tests
 coverage: ## Run all tests with coverage
 	pipenv run pytest --cov

-data/container_stats.json:
-	mkdir -p data
-	cat data/container_export.json | jq .issnl -r | sort -u > /tmp/container_issnl.tsv
-	cat /tmp/container_issnl.tsv | parallel -j10 curl -s 'https://fatcat.wiki/container/issnl/{}/stats.json' | jq -c . > /tmp/container_stats.json
-	mv /tmp/container_stats.json data
-
 .PHONY: container-stats
-container-stats: data/container_stats.json
-	wc -l data/container_stats.json
+container-stats: data/$(TODAY)/container_stats.json ## Summarize fatcat container counts
+	wc -l data/$(TODAY)/container_stats.json
 	@echo
 	@echo Done

-data/homepage_status.json:
-	pipenv run ./chocula.py export_urls | shuf > /tmp/chocula_urls_to_crawl.tsv
-	pipenv run parallel -j10 --bar --pipepart -a /tmp/chocula_urls_to_crawl.shuf.tsv ./check_issn_urls.py > /tmp/homepage_status.json
-	cp /tmp/homepage_status.json data/
-
 .PHONY: homepage-status
-homepage-status: data/homepage_status.json
-	wc -l data/homepage-status.json
+homepage-status: data/$(TODAY)/homepage_status.json ## Check homepage "live"/"archive" existence for current database
+	wc -l data/$(TODAY)/homepage_status.json
 	@echo
 	@echo Done

@@ -111,9 +100,9 @@ data/$(TODAY)/homepage_status.json:
 	mv /tmp/url_status.json $@

 data/$(TODAY)/container_stats.json: data/container_export.json
-	cat data/container_export.json | jq .issnl -r | sort -u > /tmp/container_issnl.tsv
-	cat /tmp/container_issnl.tsv | parallel -j10 curl --fail -s 'https://fatcat.wiki/container/issnl/{}/stats.json' | jq -c . | pv -l > /tmp/container_stats.json
-	cp /tmp/container_stats.json $@
+	cat data/container_export.json | jq .ident -r | sort -u > /tmp/container_ident.tsv
+	cat /tmp/container_ident.tsv | parallel -j10 curl --max-time 30 --fail -s 'https://fatcat.wiki/container/{}/stats.json' | jq -c . | pv -l > /tmp/container_stats.json
+	mv /tmp/container_stats.json $@

 .PHONY: upload-sources
 upload-sources: update-sources ## Upload most recent update-able sources to a new IA item
diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py
index ed306c0..0bd13f8 100644
--- a/chocula/directories/__init__.py
+++ b/chocula/directories/__init__.py
@@ -19,6 +19,7 @@ from chocula.directories.issn_meta import IssnMetaLoader
 from chocula.directories.australian_era import AustralianEraLoader
 from chocula.directories.awol import AwolLoader
 from chocula.directories.mag import MagLoader
+from chocula.directories.openalex import OpenAlexLoader

 # sort order roughly results in metadata prioritization
 ALL_CHOCULA_DIR_CLASSES = [
@@ -43,4 +44,5 @@ ALL_CHOCULA_DIR_CLASSES = [
     SimLoader,
     ZdbFizeLoader,
     MagLoader,
+    OpenAlexLoader,
 ]
diff --git a/chocula/directories/openalex.py b/chocula/directories/openalex.py
new file mode 100644
index 0000000..478c814
--- /dev/null
+++ b/chocula/directories/openalex.py
@@ -0,0 +1,68 @@
+from typing import Iterable, Optional
+import csv
+
+from chocula.util import clean_str, clean_issn
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class OpenAlexLoader(DirectoryLoader):
+    """
+    TSV Columns (from schema docs):
+
+    1   JournalId           long    PRIMARY KEY
+    2   Rank                uint    (DEPRECATED)
+    3   NormalizedName      string
+    4   DisplayName         string
+    5   Issn                string  (ISSN-L)
+    6   Issns               JSON list
+    7   IsOa                bool
+    8   IsInDoaj            bool
+    9   Publisher           string
+    10  Webpage             string
+    11  PaperCount          long
+    12  PaperFamilyCount    long    (DEPRECATED)
+    13  CitationCount       long
+    14  CreatedDate         DateTime
+    15  UpdatedDate         DateTime
+
+    """
+
+    source_slug = "openalex"
+
+    def open_file(self) -> Iterable:
+        return csv.DictReader(
+            open(self.config.openalex.filepath, "r"),
+            delimiter="\t",
+            fieldnames=[
+                "JournalId",
+                "Rank",
+                "NormalizedName",
+                "DisplayName",
+                "Issn",
+                "Issns",
+                "IsOa",
+                "IsInDoaj",
+                "Publisher",
+                "Webpage",
+                "PaperCount",
+                "PaperFamilyCount",
+                "CitationCount",
+                "CreatedDate",
+                "UpdatedDate",
+            ],
+        )
+
+    def parse_record(self, record) -> Optional[DirectoryInfo]:
+        info = DirectoryInfo(
+            directory_slug=self.source_slug,
+            issnl=clean_issn(record["Issn"]),
+            custom_id=record["JournalId"],
+            name=clean_str(record["DisplayName"]),
+            publisher=clean_str(record["Publisher"]),
+        )
+        homepage = HomepageUrl.from_url(record["Webpage"] or "")
+        if homepage:
+            info.homepage_urls.append(homepage)
+
+        return info
--
cgit v1.2.3
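
For reference, below is a minimal standalone sketch of reading the TSV layout documented in the new loader's docstring, e.g. to spot-check an OpenAlex/MAG-style journals dump before running the import. It is not part of the patch: the file name "Journals.txt", the ad-hoc ISSN-L shape check, and the field cleanup are illustrative assumptions; the real loader delegates that work to clean_issn()/clean_str() and the DirectoryLoader machinery shown above.

# Standalone sketch (assumptions: a local "Journals.txt" dump, a naive ISSN-L check).
# Reads a tab-separated journals file with the same column order the patch's
# OpenAlexLoader declares, and yields a few cleaned-up fields per row.
import csv

FIELDNAMES = [
    "JournalId", "Rank", "NormalizedName", "DisplayName", "Issn", "Issns",
    "IsOa", "IsInDoaj", "Publisher", "Webpage", "PaperCount",
    "PaperFamilyCount", "CitationCount", "CreatedDate", "UpdatedDate",
]


def iter_journals(path):
    with open(path, "r") as f:
        for row in csv.DictReader(f, delimiter="\t", fieldnames=FIELDNAMES):
            issnl = (row.get("Issn") or "").strip()
            # Keep only rows with a plausible ISSN-L shape ("1234-567X");
            # the real loader relies on chocula.util.clean_issn() for this.
            if len(issnl) != 9 or issnl[4] != "-":
                continue
            yield {
                "issnl": issnl,
                "name": (row.get("DisplayName") or "").strip(),
                "publisher": (row.get("Publisher") or "").strip(),
                "homepage": (row.get("Webpage") or "").strip(),
            }


if __name__ == "__main__":
    # Example usage: print the first few parsed records from the dump.
    for i, rec in enumerate(iter_journals("Journals.txt")):
        print(rec)
        if i >= 4:
            break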