diff options
author | Bryan Newbold <bnewbold@archive.org> | 2020-06-03 01:11:23 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2020-06-03 01:11:23 -0700 |
commit | 8f0aa515d4dff2537eb9e1ab557da0b067f42250 (patch) | |
tree | 08d38c42b269292136af26122543b459918dc9da | |
parent | 9418d1d15ca809b7796085cf23afa0948cf956c4 (diff) | |
download | chocula-8f0aa515d4dff2537eb9e1ab557da0b067f42250.tar.gz chocula-8f0aa515d4dff2537eb9e1ab557da0b067f42250.zip |
scielo metadata import
-rw-r--r-- | chocula/directories/__init__.py | 3 | ||||
-rw-r--r-- | chocula/directories/scielo.py | 48 | ||||
-rw-r--r-- | sources.toml | 7 | ||||
-rw-r--r-- | tests/files/ISSN-to-ISSN-L.txt | 19 |
4 files changed, 76 insertions, 1 deletions
```diff
diff --git a/chocula/directories/__init__.py b/chocula/directories/__init__.py
index 4bed696..a233a26 100644
--- a/chocula/directories/__init__.py
+++ b/chocula/directories/__init__.py
@@ -9,11 +9,12 @@ from chocula.directories.openapc import OpenAPCLoader
 from chocula.directories.road import RoadLoader
 from chocula.directories.sherpa_romeo import SherpaRomeoLoader
 from chocula.directories.sim import SimLoader
+from chocula.directories.scielo import ScieloLoader
 from chocula.directories.szczepanski import SzczepanskiLoader
 from chocula.directories.wikidata import WikidataLoader
 
 ALL_CHOCULA_DIR_CLASSES = [
     CrossrefLoader, DoajLoader, EntrezLoader,EzbLoader, GoldOALoader,
     NorwegianLoader, OpenAPCLoader, RoadLoader, SherpaRomeoLoader,
-    SzczepanskiLoader, WikidataLoader, SimLoader,
+    SzczepanskiLoader, WikidataLoader, SimLoader, ScieloLoader,
 ]
diff --git a/chocula/directories/scielo.py b/chocula/directories/scielo.py
new file mode 100644
index 0000000..247866b
--- /dev/null
+++ b/chocula/directories/scielo.py
@@ -0,0 +1,48 @@
+
+from typing import Iterable, Optional
+import json
+
+from chocula.util import clean_str, clean_issn
+from chocula.common import DirectoryLoader
+from chocula.database import DirectoryInfo, HomepageUrl
+
+
+class ScieloLoader(DirectoryLoader):
+
+    source_slug = "scielo"
+
+    def open_file(self) -> Iterable:
+        return open(self.config.scielo.filepath)
+
+    def parse_record(self, line) -> Optional[DirectoryInfo]:
+        record = json.loads(line)
+        extra = dict(
+            status=clean_str(record.get('current_status')),
+            first_year=record.get('first_year'),
+            collection=record.get('collection_acronym'),
+        )
+        for k in list(extra.keys()):
+            if extra[k] is None:
+                extra.pop(k)
+        country: Optional[str] = None
+        if record['publisher_country'] and len(record['publisher_country'][0]) == 2:
+            country = record['publisher_country'][0].lower()
+        info = DirectoryInfo(
+            directory_slug=self.source_slug,
+            issne=clean_issn(record.get('electronic_issn') or ''),
+            issnp=clean_issn(record.get('print_issn') or ''),
+            custom_id=clean_str(record.get('scielo_issn')),
+            name=clean_str(record.get('fulltitle')),
+            publisher=clean_str((record.get('publisher_name') or [''])[0]),
+            abbrev=clean_str(record['abbreviated_iso_title']),
+            platform='scielo',
+            langs=list(filter(lambda s: len(s) == 2, record['languages'])),
+            country=country,
+            extra=extra,
+        )
+        if record['url']:
+            homepage = HomepageUrl.from_url(record['url'])
+            if homepage:
+                info.homepage_urls.append(homepage)
+        return info
+
diff --git a/sources.toml b/sources.toml
index efde2d8..a91d8ee 100644
--- a/sources.toml
+++ b/sources.toml
@@ -57,6 +57,13 @@ filename = "norwegian_register.csv"
 original_url = "https://dbh.nsd.uib.no/publiseringskanaler/AlltidFerskListe"
 mirror_url = "https://archive.org/download/norwegian_register_journals"
 
+[scielo]
+date = "2020-05-05"
+filename = "scielo.json"
+mirror_url = "https://archive.org/download/scielo-journal-metadata/journals.20200505.json"
+# not to be confused with scielo_journal_list.20200428.csv also floating
+# around, but has no ISSNs
+
 [lockss]
 filename = "kbart_LOCKSS.txt"
 original_url = "https://reports.lockss.org/kbart/kbart_LOCKSS.txt"
diff --git a/tests/files/ISSN-to-ISSN-L.txt b/tests/files/ISSN-to-ISSN-L.txt
index f44ea24..9c7b339 100644
--- a/tests/files/ISSN-to-ISSN-L.txt
+++ b/tests/files/ISSN-to-ISSN-L.txt
@@ -258,3 +258,22 @@ ISSN ISSN-L
 0009-5532 0009-5532
 0888-8817 0888-8817
 0001-1452 0001-1452
+0102-7182 0102-7182
+1679-074X 1679-074X
+1982-5471 1982-5471
+1516-1498 1516-1498
+1516-2567 1516-2567
+1413-0556 1413-0556
+0104-8023 0104-8023
+1413-0556 1413-0556
+1679-074X 1679-074X
+0103-166X 0103-166X
+0124-4906 0124-4906
+0104-3269 0104-3269
+1983-3288 1983-3288
+1516-8530 1516-8530
+1982-5471 1982-5471
+1809-8894 1809-8894
+0102-7182 0102-7182
+1806-6631 1806-6631
+1809-8894 1809-8894
```