From 427f25fb9a362348df644afae2f56124634ca67d Mon Sep 17 00:00:00 2001
From: Bryan Newbold <bnewbold@archive.org>
Date: Tue, 23 Jun 2020 17:23:37 -0700
Subject: vanished journal metadata importer

---
 chocula/directories/vanished_disapeared.py | 63 ++++++++++++++++++++++++++++++
 chocula/directories/vanished_inactive.py   | 50 ++++++++++++++++++++++++
 2 files changed, 113 insertions(+)
 create mode 100644 chocula/directories/vanished_disapeared.py
 create mode 100644 chocula/directories/vanished_inactive.py

(limited to 'chocula')

diff --git a/chocula/directories/vanished_disapeared.py b/chocula/directories/vanished_disapeared.py
new file mode 100644
index 0000000..a5e4c38
--- /dev/null
+++ b/chocula/directories/vanished_disapeared.py
@@ -0,0 +1,63 @@
+import csv
+from typing import Iterable, Optional
+
+from chocula.util import clean_str, clean_issn, parse_lang, parse_country
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class VanishedDisapearedLoader(DirectoryLoader):
+    """
+    Journal-level metadata from the "Vanished Journals" project. This is the
+    "disapeared" dataset, with many homepage URLs in wayback (web.archive.org).
+
+    CSV headers:
+        - Source
+        - If Identified by second source
+        - Journal Name
+        - ISSN
+        - E-ISSN
+        - URL
+        - Publisher
+        - blank
+        - Language(s)
+        - Country
+        - society_affiliation
+        - other_sci_affiliation
+        - Discipline
+        - Discipline Group
+        - Start Year
+        - End Year
+        - Last Year Online
+        - Actively Publishing
+        - Internet Archive Link
+        - Verified
+        - Comments
+        - The Keepers (archived)
+        - Archive Link
+        - Mikael (1 = agree with Lisa)
+    """
+
+    source_slug = "vanished_disapeared"
+
+    def open_file(self) -> Iterable:
+        """Return a DictReader over the CSV; the file stays open for iteration."""
+        path = self.config.vanished_disapeared.filepath
+        return csv.DictReader(open(path, newline=""))  # newline="" per csv docs
+
+    def parse_record(self, record) -> Optional[DirectoryInfo]:
+        """Map one CSV row to a DirectoryInfo; falsy parsed languages are dropped."""
+        info = DirectoryInfo(
+            directory_slug=self.source_slug,
+            raw_issn=clean_issn(record["ISSN"]),
+            issne=clean_issn(record["E-ISSN"]),
+            name=clean_str(record["Journal Name"]),
+            publisher=clean_str(record["Publisher"]),
+            langs=[lang for lang in [parse_lang(record["Language(s)"])] if lang],
+            country=parse_country(record["Country"]),
+        )
+        for column in ("Internet Archive Link", "Archive Link"):
+            homepage = HomepageUrl.from_url(record[column])
+            if homepage:
+                info.homepage_urls.append(homepage)
+        return info
diff --git a/chocula/directories/vanished_inactive.py b/chocula/directories/vanished_inactive.py
new file mode 100644
index 0000000..7996084
--- /dev/null
+++ b/chocula/directories/vanished_inactive.py
@@ -0,0 +1,50 @@
+import csv
+from typing import Iterable, Optional
+
+from chocula.util import clean_str, clean_issn, parse_lang, parse_country
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo
+
+
+class VanishedInactiveLoader(DirectoryLoader):
+    """
+    Journal-level metadata from the "Vanished Journals" project. This is the
+    "inactive" dataset.
+
+    CSV headers:
+        - Source
+        - Title
+        - Identifier
+        - Publisher
+        - Comment
+        - Language
+        - ISSN
+        - EISSN
+        - Keyword
+        - Start Year
+        - End Year
+        - Added on date
+        - Subjects
+        - Country
+        - Publication fee
+        - Further Information
+    """
+
+    source_slug = "vanished_inactive"
+
+    def open_file(self) -> Iterable:
+        """Return a DictReader over the CSV; the file stays open for iteration."""
+        path = self.config.vanished_inactive.filepath
+        return csv.DictReader(open(path, newline=""))  # newline="" per csv docs
+
+    def parse_record(self, record) -> Optional[DirectoryInfo]:
+        """Map one CSV row to a DirectoryInfo; falsy parsed languages are dropped."""
+        return DirectoryInfo(
+            directory_slug=self.source_slug,
+            raw_issn=clean_issn(record["ISSN"]),
+            issne=clean_issn(record["EISSN"]),
+            name=clean_str(record["Title"]),
+            publisher=clean_str(record["Publisher"]),
+            langs=[lang for lang in [parse_lang(record["Language"])] if lang],
+            country=parse_country(record["Country"]),
+        )
-- 
cgit v1.2.3