aboutsummaryrefslogtreecommitdiffstats
path: root/chocula/directories/openalex.py
diff options
context:
space:
mode:
Diffstat (limited to 'chocula/directories/openalex.py')
-rw-r--r--chocula/directories/openalex.py68
1 files changed, 68 insertions, 0 deletions
diff --git a/chocula/directories/openalex.py b/chocula/directories/openalex.py
new file mode 100644
index 0000000..478c814
--- /dev/null
+++ b/chocula/directories/openalex.py
@@ -0,0 +1,68 @@
+from typing import Iterable, Optional
+import csv
+
+from chocula.util import clean_str, clean_issn
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class OpenAlexLoader(DirectoryLoader):
+ """
+ TSV Columns (from schema docs):
+
+ 1 JournalId long PRIMARY KEY
+ 2 Rank uint (DEPRECATED)
+ 3 NormalizedName string
+ 4 DisplayName string
+ 5 Issn string (ISSN-L)
+ 6 Issns JSON list
+ 7 IsOa bool
+ 8 IsInDoaj bool
+ 9 Publisher string
+ 10 Webpage string
+ 11 PaperCount long
+ 12 PaperFamilyCount long (DEPRECATED)
+ 13 CitationCount long
+ 14 CreatedDate DateTime
+ 15 UpdatedDate DateTime
+
+ """
+
+ source_slug = "openalex"
+
+ def open_file(self) -> Iterable:
+ return csv.DictReader(
+ open(self.config.openalex.filepath, "r"),
+ delimiter="\t",
+ fieldnames=[
+ "JournalId",
+ "Rank",
+ "NormalizedName",
+ "DisplayName",
+ "Issn",
+ "Issns",
+ "IsOa",
+ "IsInDoaj",
+ "Publisher",
+ "Webpage",
+ "PaperCount",
+ "PaperFamilyCount",
+ "CitationCount",
+ "CreatedDate",
+ "UpdatedDate",
+ ],
+ )
+
+ def parse_record(self, record) -> Optional[DirectoryInfo]:
+ info = DirectoryInfo(
+ directory_slug=self.source_slug,
+ issnl=clean_issn(record["Issn"]),
+ custom_id=record["JournalId"],
+ name=clean_str(record["DisplayName"]),
+ publisher=clean_str(record["Publisher"]),
+ )
+ homepage = HomepageUrl.from_url(record["Webpage"] or "")
+ if homepage:
+ info.homepage_urls.append(homepage)
+
+ return info