aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@archive.org>2020-06-23 11:39:10 -0700
committerBryan Newbold <bnewbold@archive.org>2020-06-23 11:41:49 -0700
commit9f53eb4c4fd4030965fe004184c803b41fa49b04 (patch)
tree70532f2f32efdcb5326f7844560e6b3e8a5441f1
parentcd7fce808e60c09d184b7ec7e72570494a87d399 (diff)
downloadchocula-9f53eb4c4fd4030965fe004184c803b41fa49b04.tar.gz
chocula-9f53eb4c4fd4030965fe004184c803b41fa49b04.zip
new manual homepage source
-rw-r--r--chocula/directories/__init__.py2
-rw-r--r--chocula/directories/manual_homepages.py47
-rw-r--r--sources.toml4
-rw-r--r--tests/files/ISSN-to-ISSN-L.txt10
-rw-r--r--tests/files/manual_longtail_homepages.tsv30
5 files changed, 93 insertions, 0 deletions
diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py
index 90e6f26..6f32c8e 100644
--- a/chocula/directories/__init__.py
+++ b/chocula/directories/__init__.py
@@ -11,6 +11,7 @@ from chocula.directories.sim import SimLoader
from chocula.directories.scielo import ScieloLoader
from chocula.directories.szczepanski import SzczepanskiLoader
from chocula.directories.wikidata import WikidataLoader
+from chocula.directories.manual_homepages import ManualHomepageLoader
ALL_CHOCULA_DIR_CLASSES = [
CrossrefLoader,
@@ -26,4 +27,5 @@ ALL_CHOCULA_DIR_CLASSES = [
WikidataLoader,
SimLoader,
ScieloLoader,
+ ManualHomepageLoader,
]
diff --git a/chocula/directories/manual_homepages.py b/chocula/directories/manual_homepages.py
new file mode 100644
index 0000000..3f84794
--- /dev/null
+++ b/chocula/directories/manual_homepages.py
@@ -0,0 +1,47 @@
+import csv
+from typing import Iterable, Optional
+
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class ManualHomepageLoader(DirectoryLoader):
+
+ source_slug = "manual_homepages"
+
+ def open_file(self) -> Iterable:
+ return csv.DictReader(
+ open(self.config.manual_homepages.filepath), delimiter="\t",
+ )
+
+ def parse_record(self, record) -> Optional[DirectoryInfo]:
+ """
+ Most of this metadata comes from chocula/fatcat; we are only interested
+ in the homepage URLs.
+
+ The "corrected titles" have been manually entered into fatcat directly.
+
+ CSV columns:
+ - issnl
+ - issnp
+ - issne
+ - name
+ - corrected title
+ - publisher
+ - country
+ - lang
+ - release_count
+ - Homepage URL
+ - Inactive
+ """
+
+ info = DirectoryInfo(directory_slug=self.source_slug, issnl=record["issnl"],)
+ url = record["Homepage URL"]
+ if url is None or url.lower() == "unknown" or len(url) < 4:
+ return None
+ homepage = HomepageUrl.from_url(url)
+ if homepage:
+ info.homepage_urls.append(homepage)
+ if homepage is None:
+ return None
+ return info
diff --git a/sources.toml b/sources.toml
index a91d8ee..31b23c4 100644
--- a/sources.toml
+++ b/sources.toml
@@ -128,3 +128,7 @@ filename = "homepage_status.json"
date = "2020-06-03"
filename = "container_stats.json"
+[manual_homepages]
+date = "2020-05-05"
+filename = "manual_longtail_homepages.tsv"
+mirror_url = "https://archive.org/download/chocula-manual-hompages"
diff --git a/tests/files/ISSN-to-ISSN-L.txt b/tests/files/ISSN-to-ISSN-L.txt
index 9c7b339..28ccbb4 100644
--- a/tests/files/ISSN-to-ISSN-L.txt
+++ b/tests/files/ISSN-to-ISSN-L.txt
@@ -277,3 +277,13 @@ ISSN ISSN-L
0102-7182 0102-7182
1806-6631 1806-6631
1809-8894 1809-8894
+2456-9348 2456-9348
+2615-8744 2615-8744
+2621-1025 2621-1025
+2639-1724 2639-1724
+1069-5370 1069-5370
+2522-333X 2522-333X
+2550-0996 2550-0996
+2503-3115 2503-3115
+2639-8915 2639-8915
+2198-7904 2198-7904
diff --git a/tests/files/manual_longtail_homepages.tsv b/tests/files/manual_longtail_homepages.tsv
new file mode 100644
index 0000000..cdd04b2
--- /dev/null
+++ b/tests/files/manual_longtail_homepages.tsv
@@ -0,0 +1,30 @@
+issnl issnp issne name corrected title publisher country lang release_count Homepage URL Inactive
+2456-9348 ¤ ¤ International journal of engineering technology research & management ¤ ¤ ¤ 0 http://www.ijetrm.com/
+2615-8744 ¤ ¤ Journal of sport and exercise science ¤ ¤ ¤ 0 https://journal.unesa.ac.id/index.php/jses
+2621-1025 ¤ ¤ Maha Widya Bhuwana ¤ ¤ ¤ 0 unknown
+2639-1724 ¤ ¤ International journal of nursing and hospital care ¤ ¤ ¤ 0 https://www.biocoreopen.org/ijnh/archive.php
+1069-5370 ¤ ¤ Natural Resources and Environmental Issues ¤ ¤ ¤ 0 https://digitalcommons.usu.edu/nrei/
+2522-333X ¤ ¤ Mağallaẗ al-Ê¿ulÅ«m al-á¹­ibbiyyaẗ wa-al-á¹£aydalÄ?niyyaẗ Mağallaẗ al-ʿulūm al-ṭibbiyyaẗ wa-al-ṣaydalāniyyaẗ ¤ ¤ ¤ 0 http://ajsrp.com/journals/jmps
+2550-0996 ¤ ¤ Sosioteknologi kreatif (Online) ¤ ¤ ¤ 0 https://e-jurnal.stieprasetiyamandiri.ac.id/index.php/sos Yes
+2503-3115 ¤ ¤ Intelegensia : jurnal pendidikan dan pembelajaran ¤ ¤ ¤ 0 http://intelegensia.org/index.php/intelegensia
+2639-8915 ¤ ¤ Journal of clinical research in anesthesiology ¤ ¤ ¤ 0 https://asclepiusopen.com/journal-of-clinical-research-in-anesthesiology/
+2198-7904 2198-7904 ¤ Publikationsreihe des Manufacturing Excellence Netzwerks Universitätsverlag der TU Berlin (Technische Universität Berlin) ¤ ¤ 0 https://www.ub.tu-berlin.de/publizieren/verlagsprogramm/collection/fak7-itm-mxaward/
+2527-6409 ¤ ¤ JRB (Jurnal Riau Biologia) ¤ ¤ ¤ 0 https://jrb.ejournal.unri.ac.id/index.php/JRB
+2654-2617 ¤ ¤ Jurnal Teknik Informatika ¤ ¤ ¤ 0 unknown
+2502-6860 ¤ ¤ Visipena ¤ ¤ ¤ 0 https://visipena.stkipgetsempena.ac.id/?journal=home
+2606-6734 ¤ ¤ Histoire & archéologie de la Chartreuse de Bertaud ¤ ¤ ¤ 0 https://bertaud.hypotheses.org/
+2548-8503 ¤ ¤ Journal of lignocellulose technology ¤ ¤ ¤ 0 unknown
+2543-6759 ¤ ¤ Wojny i Konflikty ¤ ¤ ¤ 0 unknown
+2086-0366 ¤ ¤ Jurnal Education ¤ ¤ ¤ 0 unknown
+2587-1501 ¤ ¤ Vokal ¤ ¤ ¤ 0 unknown
+0128-2581 ¤ ¤ Malaysian Journal of Catalysis ¤ ¤ ¤ 0 unknown
+2579-017X ¤ ¤ Archives of endocrinology and diabetes care ¤ ¤ ¤ 0 https://scientiaricerca.com/endc-archive.php
+2457-0397 ¤ ¤ International journal of advanced engineering ¤ ¤ ¤ 0 http://www.bharatpublication.com/current-issue.php?jID=29/IJAE
+2661-6564 ¤ ¤ Revista Científica HGDA (En línea) ¤ ¤ ¤ 0 unknown
+2528-1402 ¤ ¤ Jurnal As-Salam ¤ ¤ ¤ 0 http://jurnal-assalam.org/index.php/JAS
+2502-5279 ¤ ¤ Jurnal teknik mesin dan ilmu material ¤ ¤ ¤ 0 unknown
+2639-1805 ¤ ¤ Archives of physical health and sports medicine ¤ ¤ ¤ 0 https://www.sryahwapublications.com/archives-of-physical-health-and-sports-medicine/
+2599-0055 ¤ ¤ Jurnal Mitra Kencana Keperawatan dan Kebidanan ¤ ¤ ¤ 0 unknown
+2514-9709 ¤ ¤ Amity ¤ ¤ ¤ 0 https://amityjournal.leeds.ac.uk/
+2651-2580 ¤ ¤ Gsi journals serie b: advancements in business and economics ¤ ¤ ¤ 0 https://www.gsico.info/kopyasi-abeindexing
+2027-0658 2027-0658 ¤ Derrotero ¤ co es 0 unknown