From 9f53eb4c4fd4030965fe004184c803b41fa49b04 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Tue, 23 Jun 2020 11:39:10 -0700 Subject: new manual homepage source --- chocula/directories/__init__.py | 2 ++ chocula/directories/manual_homepages.py | 47 +++++++++++++++++++++++++++++++ sources.toml | 4 +++ tests/files/ISSN-to-ISSN-L.txt | 10 +++++++ tests/files/manual_longtail_homepages.tsv | 30 ++++++++++++++++++++ 5 files changed, 93 insertions(+) create mode 100644 chocula/directories/manual_homepages.py create mode 100644 tests/files/manual_longtail_homepages.tsv diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py index 90e6f26..6f32c8e 100644 --- a/chocula/directories/__init__.py +++ b/chocula/directories/__init__.py @@ -11,6 +11,7 @@ from chocula.directories.sim import SimLoader from chocula.directories.scielo import ScieloLoader from chocula.directories.szczepanski import SzczepanskiLoader from chocula.directories.wikidata import WikidataLoader +from chocula.directories.manual_homepages import ManualHomepageLoader ALL_CHOCULA_DIR_CLASSES = [ CrossrefLoader, @@ -26,4 +27,5 @@ ALL_CHOCULA_DIR_CLASSES = [ WikidataLoader, SimLoader, ScieloLoader, + ManualHomepageLoader, ] diff --git a/chocula/directories/manual_homepages.py b/chocula/directories/manual_homepages.py new file mode 100644 index 0000000..3f84794 --- /dev/null +++ b/chocula/directories/manual_homepages.py @@ -0,0 +1,47 @@ +import csv +from typing import Iterable, Optional + +from chocula.common import DirectoryLoader +from chocula.database import DirectoryInfo, HomepageUrl + + +class ManualHomepageLoader(DirectoryLoader): + + source_slug = "manual_homepages" + + def open_file(self) -> Iterable: + return csv.DictReader( + open(self.config.manual_homepages.filepath), delimiter="\t", + ) + + def parse_record(self, record) -> Optional[DirectoryInfo]: + """ + Most of this metadata comes from chocula/fatcat; we are only interested + in the homepage URLs. + + The "corrected titles" have been manually entered into fatcat directly. + + CSV columns: + - issnl + - issnp + - issne + - name + - corrected title + - publisher + - country + - lang + - release_count + - Homepage URL + - Inactive + """ + + info = DirectoryInfo(directory_slug=self.source_slug, issnl=record["issnl"],) + url = record["Homepage URL"] + if url is None or url.lower() == "unknown" or len(url) < 4: + return None + homepage = HomepageUrl.from_url(url) + if homepage: + info.homepage_urls.append(homepage) + if homepage is None: + return None + return info diff --git a/sources.toml b/sources.toml index a91d8ee..31b23c4 100644 --- a/sources.toml +++ b/sources.toml @@ -128,3 +128,7 @@ filename = "homepage_status.json" date = "2020-06-03" filename = "container_stats.json" +[manual_homepages] +date = "2020-05-05" +filename = "manual_longtail_homepages.tsv" +mirror_url = "https://archive.org/download/chocula-manual-hompages" diff --git a/tests/files/ISSN-to-ISSN-L.txt b/tests/files/ISSN-to-ISSN-L.txt index 9c7b339..28ccbb4 100644 --- a/tests/files/ISSN-to-ISSN-L.txt +++ b/tests/files/ISSN-to-ISSN-L.txt @@ -277,3 +277,13 @@ ISSN ISSN-L 0102-7182 0102-7182 1806-6631 1806-6631 1809-8894 1809-8894 +2456-9348 2456-9348 +2615-8744 2615-8744 +2621-1025 2621-1025 +2639-1724 2639-1724 +1069-5370 1069-5370 +2522-333X 2522-333X +2550-0996 2550-0996 +2503-3115 2503-3115 +2639-8915 2639-8915 +2198-7904 2198-7904 diff --git a/tests/files/manual_longtail_homepages.tsv b/tests/files/manual_longtail_homepages.tsv new file mode 100644 index 0000000..cdd04b2 --- /dev/null +++ b/tests/files/manual_longtail_homepages.tsv @@ -0,0 +1,30 @@ +issnl issnp issne name corrected title publisher country lang release_count Homepage URL Inactive +2456-9348 ¤ ¤ International journal of engineering technology research & management ¤ ¤ ¤ 0 http://www.ijetrm.com/ +2615-8744 ¤ ¤ Journal of sport and exercise science ¤ ¤ ¤ 0 https://journal.unesa.ac.id/index.php/jses +2621-1025 ¤ ¤ Maha Widya Bhuwana ¤ ¤ ¤ 0 unknown +2639-1724 ¤ ¤ International journal of nursing and hospital care ¤ ¤ ¤ 0 https://www.biocoreopen.org/ijnh/archive.php +1069-5370 ¤ ¤ Natural Resources and Environmental Issues ¤ ¤ ¤ 0 https://digitalcommons.usu.edu/nrei/ +2522-333X ¤ ¤ Mağallaẗ al-Ê¿ulÅ«m al-á¹­ibbiyyaẗ wa-al-á¹£aydalÄ?niyyaẗ Mağallaẗ al-ʿulūm al-ṭibbiyyaẗ wa-al-ṣaydalāniyyaẗ ¤ ¤ ¤ 0 http://ajsrp.com/journals/jmps +2550-0996 ¤ ¤ Sosioteknologi kreatif (Online) ¤ ¤ ¤ 0 https://e-jurnal.stieprasetiyamandiri.ac.id/index.php/sos Yes +2503-3115 ¤ ¤ Intelegensia : jurnal pendidikan dan pembelajaran ¤ ¤ ¤ 0 http://intelegensia.org/index.php/intelegensia +2639-8915 ¤ ¤ Journal of clinical research in anesthesiology ¤ ¤ ¤ 0 https://asclepiusopen.com/journal-of-clinical-research-in-anesthesiology/ +2198-7904 2198-7904 ¤ Publikationsreihe des Manufacturing Excellence Netzwerks Universitätsverlag der TU Berlin (Technische Universität Berlin) ¤ ¤ 0 https://www.ub.tu-berlin.de/publizieren/verlagsprogramm/collection/fak7-itm-mxaward/ +2527-6409 ¤ ¤ JRB (Jurnal Riau Biologia) ¤ ¤ ¤ 0 https://jrb.ejournal.unri.ac.id/index.php/JRB +2654-2617 ¤ ¤ Jurnal Teknik Informatika ¤ ¤ ¤ 0 unknown +2502-6860 ¤ ¤ Visipena ¤ ¤ ¤ 0 https://visipena.stkipgetsempena.ac.id/?journal=home +2606-6734 ¤ ¤ Histoire & archéologie de la Chartreuse de Bertaud ¤ ¤ ¤ 0 https://bertaud.hypotheses.org/ +2548-8503 ¤ ¤ Journal of lignocellulose technology ¤ ¤ ¤ 0 unknown +2543-6759 ¤ ¤ Wojny i Konflikty ¤ ¤ ¤ 0 unknown +2086-0366 ¤ ¤ Jurnal Education ¤ ¤ ¤ 0 unknown +2587-1501 ¤ ¤ Vokal ¤ ¤ ¤ 0 unknown +0128-2581 ¤ ¤ Malaysian Journal of Catalysis ¤ ¤ ¤ 0 unknown +2579-017X ¤ ¤ Archives of endocrinology and diabetes care ¤ ¤ ¤ 0 https://scientiaricerca.com/endc-archive.php +2457-0397 ¤ ¤ International journal of advanced engineering ¤ ¤ ¤ 0 http://www.bharatpublication.com/current-issue.php?jID=29/IJAE +2661-6564 ¤ ¤ Revista Científica HGDA (En línea) ¤ ¤ ¤ 0 unknown +2528-1402 ¤ ¤ Jurnal As-Salam ¤ ¤ ¤ 0 http://jurnal-assalam.org/index.php/JAS +2502-5279 ¤ ¤ Jurnal teknik mesin dan ilmu material ¤ ¤ ¤ 0 unknown +2639-1805 ¤ ¤ Archives of physical health and sports medicine ¤ ¤ ¤ 0 https://www.sryahwapublications.com/archives-of-physical-health-and-sports-medicine/ +2599-0055 ¤ ¤ Jurnal Mitra Kencana Keperawatan dan Kebidanan ¤ ¤ ¤ 0 unknown +2514-9709 ¤ ¤ Amity ¤ ¤ ¤ 0 https://amityjournal.leeds.ac.uk/ +2651-2580 ¤ ¤ Gsi journals serie b: advancements in business and economics ¤ ¤ ¤ 0 https://www.gsico.info/kopyasi-abeindexing +2027-0658 2027-0658 ¤ Derrotero ¤ co es 0 unknown -- cgit v1.2.3