aboutsummaryrefslogtreecommitdiffstats
path: root/chocula
diff options
context:
space:
mode:
author: Bryan Newbold <bnewbold@archive.org> 2020-06-23 17:23:37 -0700
committer: Bryan Newbold <bnewbold@archive.org> 2020-06-23 17:23:37 -0700
commit: 427f25fb9a362348df644afae2f56124634ca67d (patch)
tree: 1fc8ef796abdaac009121bca2093214e4ff77578 /chocula
parent: 571b1f77e9375c7bab5ccbe8ae41c60dd2c64779 (diff)
download: chocula-427f25fb9a362348df644afae2f56124634ca67d.tar.gz
download: chocula-427f25fb9a362348df644afae2f56124634ca67d.zip
vanished journal metadata importer
Diffstat (limited to 'chocula')
-rw-r--r-- chocula/directories/vanished_disapeared.py 63
-rw-r--r-- chocula/directories/vanished_inactive.py 50
2 files changed, 113 insertions, 0 deletions
diff --git a/chocula/directories/vanished_disapeared.py b/chocula/directories/vanished_disapeared.py
new file mode 100644
index 0000000..a5e4c38
--- /dev/null
+++ b/chocula/directories/vanished_disapeared.py
@@ -0,0 +1,63 @@
+import csv
+from typing import Iterable, Optional
+
+from chocula.util import clean_str, clean_issn, parse_lang, parse_country
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class VanishedDisapearedLoader(DirectoryLoader):
+    """
+    Journal-level metadata from the "Vanished Journals" project. This is the
+    "disapeared" dataset, with many homepage URLs in wayback (web.archive.org).
+
+    CSV headers:
+
+    - Source
+    - If Identified by second source
+    - Journal Name
+    - ISSN
+    - E-ISSN
+    - URL
+    - Publisher
+    - blank
+    - Language(s)
+    - Country
+    - society_affiliation
+    - other_sci_affiliation
+    - Discipline
+    - Discipline Group
+    - Start Year
+    - End Year
+    - Last Year Online
+    - Actively Publishing
+    - Internet Archive Link
+    - Verified
+    - Comments
+    - The Keepers (archived)
+    - Archive Link
+    - Mikael (1 = agree with Lisa)
+    """
+
+    source_slug = "vanished_disapeared"
+
+    def open_file(self) -> Iterable:
+        """
+        Yield one dict per CSV row, keyed by the CSV header names.
+
+        Written as a generator so the file handle is closed as soon as
+        iteration finishes, instead of staying open until garbage collection.
+        """
+        # newline="" is what the csv module asks for, so that newlines
+        # embedded inside quoted fields are parsed correctly.
+        with open(self.config.vanished_disapeared.filepath, newline="") as csv_file:
+            yield from csv.DictReader(csv_file)
+
+    def parse_record(self, record) -> Optional[DirectoryInfo]:
+        """
+        Build a DirectoryInfo from a single CSV row.
+
+        `record` is indexed by header name; a missing header raises KeyError.
+        """
+        # Only keep the language when parse_lang() produced something, so
+        # `langs` never ends up as [None].
+        # NOTE(review): assumes parse_lang() returns a falsy value for
+        # unparseable input -- confirm in chocula.util.
+        lang = parse_lang(record["Language(s)"])
+
+        info = DirectoryInfo(
+            directory_slug=self.source_slug,
+            raw_issn=clean_issn(record["ISSN"]),
+            issne=clean_issn(record["E-ISSN"]),
+            name=clean_str(record["Journal Name"]),
+            publisher=clean_str(record["Publisher"]),
+            langs=[lang] if lang else [],
+            country=parse_country(record["Country"]),
+        )
+
+        # Two separate columns can carry a homepage link; keep every one
+        # that HomepageUrl.from_url() accepts, in column order.
+        for column in ("Internet Archive Link", "Archive Link"):
+            homepage = HomepageUrl.from_url(record[column])
+            if homepage:
+                info.homepage_urls.append(homepage)
+        return info
diff --git a/chocula/directories/vanished_inactive.py b/chocula/directories/vanished_inactive.py
new file mode 100644
index 0000000..7996084
--- /dev/null
+++ b/chocula/directories/vanished_inactive.py
@@ -0,0 +1,50 @@
+import csv
+from typing import Iterable, Optional
+
+from chocula.util import clean_str, clean_issn, parse_lang, parse_country
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo
+
+
+class VanishedInactiveLoader(DirectoryLoader):
+    """
+    Journal-level metadata from the "Vanished Journals" project. This is the
+    "inactive" dataset.
+
+    CSV headers:
+
+    - Source
+    - Title
+    - Identifier
+    - Publisher
+    - Comment
+    - Language
+    - ISSN
+    - EISSN
+    - Keyword
+    - Start Year
+    - End Year
+    - Added on date
+    - Subjects
+    - Country
+    - Publication fee
+    - Further Information
+    """
+
+    source_slug = "vanished_inactive"
+
+    def open_file(self) -> Iterable:
+        """
+        Yield one dict per CSV row, keyed by the CSV header names.
+
+        Written as a generator so the file handle is closed as soon as
+        iteration finishes, instead of staying open until garbage collection.
+        """
+        # newline="" is what the csv module asks for, so that newlines
+        # embedded inside quoted fields are parsed correctly.
+        with open(self.config.vanished_inactive.filepath, newline="") as csv_file:
+            yield from csv.DictReader(csv_file)
+
+    def parse_record(self, record) -> Optional[DirectoryInfo]:
+        """
+        Build a DirectoryInfo from a single CSV row.
+
+        `record` is indexed by header name; a missing header raises KeyError.
+        """
+        # Only keep the language when parse_lang() produced something, so
+        # `langs` never ends up as [None].
+        # NOTE(review): assumes parse_lang() returns a falsy value for
+        # unparseable input -- confirm in chocula.util.
+        lang = parse_lang(record["Language"])
+
+        return DirectoryInfo(
+            directory_slug=self.source_slug,
+            raw_issn=clean_issn(record["ISSN"]),
+            issne=clean_issn(record["EISSN"]),
+            name=clean_str(record["Title"]),
+            publisher=clean_str(record["Publisher"]),
+            langs=[lang] if lang else [],
+            country=parse_country(record["Country"]),
+        )