diff options
| author | Bryan Newbold <bnewbold@archive.org> | 2020-06-23 11:39:10 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-23 11:41:49 -0700 | 
| commit | 9f53eb4c4fd4030965fe004184c803b41fa49b04 (patch) | |
| tree | 70532f2f32efdcb5326f7844560e6b3e8a5441f1 | |
| parent | cd7fce808e60c09d184b7ec7e72570494a87d399 (diff) | |
| download | chocula-9f53eb4c4fd4030965fe004184c803b41fa49b04.tar.gz chocula-9f53eb4c4fd4030965fe004184c803b41fa49b04.zip | |
new manual homepage source
| -rw-r--r-- | chocula/directories/__init__.py | 2 | ||||
| -rw-r--r-- | chocula/directories/manual_homepages.py | 47 | ||||
| -rw-r--r-- | sources.toml | 4 | ||||
| -rw-r--r-- | tests/files/ISSN-to-ISSN-L.txt | 10 | ||||
| -rw-r--r-- | tests/files/manual_longtail_homepages.tsv | 30 | 
5 files changed, 93 insertions, 0 deletions
| diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py
index 90e6f26..6f32c8e 100644
--- a/chocula/directories/__init__.py
+++ b/chocula/directories/__init__.py
@@ -11,6 +11,7 @@ from chocula.directories.sim import SimLoader
 from chocula.directories.scielo import ScieloLoader
 from chocula.directories.szczepanski import SzczepanskiLoader
 from chocula.directories.wikidata import WikidataLoader
+from chocula.directories.manual_homepages import ManualHomepageLoader
 
 ALL_CHOCULA_DIR_CLASSES = [
     CrossrefLoader,
@@ -26,4 +27,5 @@ ALL_CHOCULA_DIR_CLASSES = [
     WikidataLoader,
     SimLoader,
     ScieloLoader,
+    ManualHomepageLoader,
 ]
diff --git a/chocula/directories/manual_homepages.py b/chocula/directories/manual_homepages.py
new file mode 100644
index 0000000..3f84794
--- /dev/null
+++ b/chocula/directories/manual_homepages.py
@@ -0,0 +1,47 @@
+import csv
+from typing import Iterable, Optional
+
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class ManualHomepageLoader(DirectoryLoader):
+
+    source_slug = "manual_homepages"
+
+    def open_file(self) -> Iterable:
+        return csv.DictReader(
+            open(self.config.manual_homepages.filepath), delimiter="\t",
+        )
+
+    def parse_record(self, record) -> Optional[DirectoryInfo]:
+        """
+        Most of this metadata comes from chocula/fatcat; we are only interested
+        in the homepage URLs.
+
+        The "corrected titles" have been manually entered into fatcat directly.
+
+        CSV columns:
+        - issnl
+        - issnp
+        - issne
+        - name
+        - corrected title
+        - publisher
+        - country
+        - lang
+        - release_count
+        - Homepage URL
+        - Inactive
+        """
+
+        info = DirectoryInfo(directory_slug=self.source_slug, issnl=record["issnl"],)
+        url = record["Homepage URL"]
+        if url is None or url.lower() == "unknown" or len(url) < 4:
+            return None
+        homepage = HomepageUrl.from_url(url)
+        if homepage:
+            info.homepage_urls.append(homepage)
+        if homepage is None:
+            return None
+        return info
diff --git a/sources.toml b/sources.toml
index a91d8ee..31b23c4 100644
--- a/sources.toml
+++ b/sources.toml
@@ -128,3 +128,7 @@ filename = "homepage_status.json"
 date = "2020-06-03"
 filename = "container_stats.json"
 
+[manual_homepages]
+date = "2020-05-05"
+filename = "manual_longtail_homepages.tsv"
+mirror_url = "https://archive.org/download/chocula-manual-hompages"
diff --git a/tests/files/ISSN-to-ISSN-L.txt b/tests/files/ISSN-to-ISSN-L.txt
index 9c7b339..28ccbb4 100644
--- a/tests/files/ISSN-to-ISSN-L.txt
+++ b/tests/files/ISSN-to-ISSN-L.txt
@@ -277,3 +277,13 @@ ISSN	ISSN-L
 0102-7182 	0102-7182
 1806-6631 	1806-6631
 1809-8894 	1809-8894
+2456-9348	2456-9348
+2615-8744	2615-8744
+2621-1025	2621-1025
+2639-1724	2639-1724
+1069-5370	1069-5370
+2522-333X	2522-333X
+2550-0996	2550-0996
+2503-3115	2503-3115
+2639-8915	2639-8915
+2198-7904	2198-7904
diff --git a/tests/files/manual_longtail_homepages.tsv b/tests/files/manual_longtail_homepages.tsv
new file mode 100644
index 0000000..cdd04b2
--- /dev/null
+++ b/tests/files/manual_longtail_homepages.tsv
@@ -0,0 +1,30 @@
+issnl	issnp	issne	name	corrected title	publisher	country	lang	release_count	Homepage URL	Inactive
 +2456-9348	¤	¤	International journal of engineering technology research & management		¤	¤	¤	0	http://www.ijetrm.com/	
 +2615-8744	¤	¤	Journal of sport and exercise science		¤	¤	¤	0	https://journal.unesa.ac.id/index.php/jses	
 +2621-1025	¤	¤	Maha Widya Bhuwana		¤	¤	¤	0	unknown	
 +2639-1724	¤	¤	International journal of nursing and hospital care		¤	¤	¤	0	https://www.biocoreopen.org/ijnh/archive.php	
 +1069-5370	¤	¤	Natural Resources and Environmental Issues		¤	¤	¤	0	https://digitalcommons.usu.edu/nrei/	
 +2522-333X	¤	¤	Mağallaẗ al-Ê¿ulÅ«m al-á¹ibbiyyaẗ wa-al-á¹£aydalÄ?niyyaẗ	Mağallaẗ al-ʿulūm al-ṭibbiyyaẗ wa-al-ṣaydalāniyyaẗ	¤	¤	¤	0	http://ajsrp.com/journals/jmps	
 +2550-0996	¤	¤	Sosioteknologi kreatif (Online)		¤	¤	¤	0	https://e-jurnal.stieprasetiyamandiri.ac.id/index.php/sos	Yes
 +2503-3115	¤	¤	Intelegensia : jurnal pendidikan dan pembelajaran		¤	¤	¤	0	http://intelegensia.org/index.php/intelegensia	
 +2639-8915	¤	¤	Journal of clinical research in anesthesiology		¤	¤	¤	0	https://asclepiusopen.com/journal-of-clinical-research-in-anesthesiology/	
 +2198-7904	2198-7904	¤	Publikationsreihe des Manufacturing Excellence Netzwerks		Universitätsverlag der TU Berlin (Technische Universität Berlin)	¤	¤	0	https://www.ub.tu-berlin.de/publizieren/verlagsprogramm/collection/fak7-itm-mxaward/	
 +2527-6409	¤	¤	JRB (Jurnal Riau Biologia)		¤	¤	¤	0	https://jrb.ejournal.unri.ac.id/index.php/JRB	
 +2654-2617	¤	¤	Jurnal Teknik Informatika		¤	¤	¤	0	unknown	
 +2502-6860	¤	¤	Visipena		¤	¤	¤	0	https://visipena.stkipgetsempena.ac.id/?journal=home	
 +2606-6734	¤	¤	Histoire & archéologie de la Chartreuse de Bertaud		¤	¤	¤	0	https://bertaud.hypotheses.org/	
 +2548-8503	¤	¤	Journal of lignocellulose technology		¤	¤	¤	0	unknown	
 +2543-6759	¤	¤	Wojny i Konflikty		¤	¤	¤	0	unknown	
 +2086-0366	¤	¤	Jurnal Education		¤	¤	¤	0	unknown	
 +2587-1501	¤	¤	Vokal		¤	¤	¤	0	unknown	
 +0128-2581	¤	¤	Malaysian Journal of Catalysis		¤	¤	¤	0	unknown	
 +2579-017X	¤	¤	Archives of endocrinology and diabetes care		¤	¤	¤	0	https://scientiaricerca.com/endc-archive.php	
 +2457-0397	¤	¤	International journal of advanced engineering		¤	¤	¤	0	http://www.bharatpublication.com/current-issue.php?jID=29/IJAE	
 +2661-6564	¤	¤	Revista Científica HGDA (En línea)		¤	¤	¤	0	unknown	
 +2528-1402	¤	¤	Jurnal As-Salam		¤	¤	¤	0	http://jurnal-assalam.org/index.php/JAS	
 +2502-5279	¤	¤	Jurnal teknik mesin dan ilmu material		¤	¤	¤	0	unknown	
 +2639-1805	¤	¤	Archives of physical health and sports medicine		¤	¤	¤	0	https://www.sryahwapublications.com/archives-of-physical-health-and-sports-medicine/	
 +2599-0055	¤	¤	Jurnal Mitra Kencana Keperawatan dan Kebidanan		¤	¤	¤	0	unknown	
 +2514-9709	¤	¤	Amity		¤	¤	¤	0	https://amityjournal.leeds.ac.uk/	
 +2651-2580	¤	¤	Gsi journals serie b: advancements in business and economics		¤	¤	¤	0	https://www.gsico.info/kopyasi-abeindexing	
 +2027-0658	2027-0658	¤	Derrotero		¤	co	es	0	unknown	
 | 
