-rwxr-xr-x  python/fatcat_import.py                            |   6
-rw-r--r--  python/fatcat_tools/importers/journal_metadata.py  | 143
-rw-r--r--  python/fatcat_tools/transforms.py                  | 183
-rw-r--r--  python/tests/files/journal_extra_metadata.snip.csv |  10
-rw-r--r--  python/tests/files/journal_metadata.sample.json    |  20
-rw-r--r--  python/tests/import_journal_metadata.py            |  18
6 files changed, 226 insertions(+), 154 deletions(-)
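In short: this change switches the journal metadata ("container") importer from the munged CSV dump to a JSON-lines file, one journal per line keyed by 'issnl', and adds a container Elasticsearch transform plus proper KBART year-span checks. A rough sketch of the new per-record flow, with a record abridged from the sample file added below (the surrounding importer plumbing is assumed, not shown):

    import json

    # record abridged from tests/files/journal_metadata.sample.json (added below)
    line = ('{"issnl": "1976-1317", "name": "Asian Nursing Research",'
            ' "publisher": "Elsevier", "road": {"as_of": "2018-01-24"},'
            ' "kbart": {"portico": {"year_spans": [[2007, 2018]]}}}')

    row = json.loads(line)
    assert row.get('issnl')  # the new JournalMetadataImporter.want() check
    # parse_record() then copies whitelisted keys ('road', 'kbart', ...) into
    # 'extra' and builds a ContainerEntity from issnl/name/publisher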
diff --git a/python/fatcat_import.py b/python/fatcat_import.py
index a47aa175..8d952067 100755
--- a/python/fatcat_import.py
+++ b/python/fatcat_import.py
@@ -24,7 +24,7 @@ def run_orcid(args):
 
 def run_journal_metadata(args):
     fii = JournalMetadataImporter(args.api,
         edit_batch_size=args.batch_size)
-    CsvLinePusher(fii, args.csv_file).run()
+    JsonLinePusher(fii, args.json_file).run()
 
 def run_matched(args):
     fmi = MatchedImporter(args.api,
@@ -93,8 +93,8 @@ def main():
         func=run_journal_metadata,
         auth_var="FATCAT_AUTH_WORKER_JOURNAL_METADATA",
     )
-    sub_journal_metadata.add_argument('csv_file',
-        help="Journal ISSN CSV metadata file to import from (or stdin)",
+    sub_journal_metadata.add_argument('json_file',
+        help="Journal JSON metadata file to import from (or stdin)",
         default=sys.stdin, type=argparse.FileType('r'))
     sub_journal_metadata.add_argument('--batch-size',
         help="size of batch to send",
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index cf3971b5..7f6b1ee8 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -28,90 +28,9 @@ def truthy(s):
 
 class JournalMetadataImporter(EntityImporter):
     """
     Imports journal metadata ("containers") by ISSN, currently from a custom
-    (data munged) .csv file format
-
-    CSV format (generated from git.archive.org/webgroup/oa-journal-analysis):
-
-        ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
-
-
-    'extra' fields:
-
-        doaj
-            as_of: datetime of most recent check; if not set, not actually in DOAJ
-            seal: bool
-            work_level: bool (are work-level publications deposited with DOAJ?)
-            archiving: array, can include 'library' or 'other'
-        road
-            as_of: datetime of most recent check; if not set, not actually in ROAD
-        pubmed (TODO: delete?)
-            as_of: datetime of most recent check; if not set, not actually indexed in pubmed
-        norwegian (TODO: drop this?)
-            as_of: datetime of most recent check; if not set, not actually indexed in pubmed
-            id (integer)
-            level (integer; 0-2)
-        kbart
-            lockss
-                year_rle
-                volume_rle
-            portico
-                ...
-            clockss
-                ...
-        sherpa_romeo
-            color
-        jstor
-            year_rle
-            volume_rle
-        scopus
-            id
-            TODO: print/electronic distinction?
-        wos
-            id
-        doi
-            crossref_doi: DOI of the title in crossref (if exists)
-            prefixes: array of strings (DOI prefixes, up to the '/'; any registrar, not just Crossref)
-        ia
-            sim
-                nap_id
-                year_rle
-                volume_rle
-            longtail: boolean
-            homepage
-                as_of: datetime of last attempt
-                url
-                status: HTTP/heritrix status of homepage crawl
-
-        issnp: string
-        issne: string
-        coden: string
-        abbrev: string
-        oclc_id: string (TODO: lookup?)
-        lccn_id: string (TODO: lookup?)
-        dblb_id: string
-        default_license: slug
-        original_name: native name (if name is translated)
-        platform: hosting platform: OJS, wordpress, scielo, etc
-        mimetypes: array of strings (eg, 'application/pdf', 'text/html')
-        first_year: year (integer)
-        last_year: if publishing has stopped
-        primary_language: single ISO code, or 'mixed'
-        languages: array of ISO codes
-        region: TODO: continent/world-region
-        nation: shortcode of nation
-        discipline: TODO: highest-level subject; "life science", "humanities", etc
-        field: TODO: narrower description of field
-        subjects: TODO?
-        url: homepage
-        is_oa: boolean. If true, can assume all releases under this container are "Open Access"
-        TODO: domains, if exclusive?
-        TODO: fulltext_regex, if a known pattern?
-
-    For KBART, etc:
-        We "over-count" on the assumption that "in-progress" status works will soon actually be preserved.
-        year and volume spans are run-length-encoded arrays, using integers:
-            - if an integer, means that year is preserved
-            - if an array of length 2, means everything between the two numbers (inclusive) is preserved
+    munged JSON format (see ../extra/journal_metadata/).
+
+    See guide for details on the many 'extra' fields used here.
     """
 
     def __init__(self, api, **kwargs):
@@ -125,34 +44,50 @@ class JournalMetadataImporter(EntityImporter):
             editgroup_extra=eg_extra)
 
     def want(self, raw_record):
-        if raw_record.get('ISSN-L'):
+        if raw_record.get('issnl'):
             return True
         return False
 
     def parse_record(self, row):
         """
-        row is a python dict (parsed from CSV).
+        row is a python dict (parsed from JSON).
+
         returns a ContainerEntity (or None if invalid or couldn't parse)
         """
-        title = or_none(row['title'])
-        issnl = or_none(row['ISSN-L'])
-        if title is None or issnl is None:
-            return None
-        extra = dict(
-            in_doaj=truthy(row['in_doaj']),
-            in_road=truthy(row['in_road']),
-            in_norwegian=truthy(row['in_norwegian']),
-            language=or_none(row['lang']),
-            url=or_none(row['url']),
-            ISSNp=or_none(row['ISSN-print']),
-            ISSNe=or_none(row['ISSN-electronic']),
-            is_oa=truthy(row['is_oa']),
-            is_kept=truthy(row['is_kept']),
-        )
+
+        extra = dict()
+        for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev',
+            'coden', 'aliases', 'original_name', 'first_year', 'last_year',
+            'platform', 'default_license', 'road', 'mimetypes',
+            'sherpa_romeo', 'kbart'):
+            if row.get(key):
+                extra[key] = row[key]
+        # TODO: not including for now: norwegian, dois/crossref, ia
+
+        extra_doaj = dict()
+        if row.get('doaj'):
+            if row['doaj'].get('as_of'):
+                extra_doaj['as_of'] = row['doaj']['as_of']
+            if row['doaj'].get('works'):
+                extra_doaj['works'] = row['doaj']['works']
+        if extra_doaj:
+            extra['doaj'] = extra_doaj
+
+        extra_ia = dict()
+        # TODO: would like an ia.longtail_ia flag
+        if row.get('sim'):
+            extra_ia['sim'] = {
+                'year_spans': row['sim']['year_spans'],
+            }
+        if extra_ia:
+            extra['ia'] = extra_ia
+
         ce = fatcat_client.ContainerEntity(
-            issnl=issnl,
-            name=clean(title),
-            publisher=or_none(clean(row['publisher'])),
+            issnl=row['issnl'],
+            container_type=None, # TODO
+            name=clean(row.get('name')),
+            publisher=clean(row.get('publisher')),
+            wikidata_qid=None, # TODO
             extra=extra)
         return ce
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index 2493b1ab..a85c877c 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -19,7 +19,22 @@ def entity_from_json(json_str, entity_type):
     thing.data = json_str
     return ac.deserialize(thing, entity_type)
 
-def release_to_elasticsearch(release):
+def check_kbart(year, archive):
+    if not archive or not archive.get('year_spans'):
+        return None
+    for span in archive['year_spans']:
+        if year >= span[0] and year <= span[1]:
+            return True
+    return False
+
+def test_check_kbart():
+
+    assert check_kbart(1990, dict(year_spans=[[2000, 2000]])) == False
+    assert check_kbart(2000, dict(year_spans=[[2000, 2000]])) == True
+    assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1990, 2000]])) == False
+    assert check_kbart(1950, dict(year_spans=[[1900, 1920], [1930, 2000]])) == True
+
+def release_to_elasticsearch(entity):
     """
     Converts from an entity model/schema to elasticsearch oriented schema.
 
@@ -27,15 +42,16 @@ def release_to_elasticsearch(release):
     Raises exception on error (never returns None)
     """
 
-    if release.state in ('redirect', 'deleted'):
+    if entity.state in ('redirect', 'deleted'):
         return dict(
-            ident = release.ident,
-            state = release.state,
+            ident = entity.ident,
+            state = entity.state,
         )
-    elif release.state != 'active':
-        raise ValueError("Unhandled release state: {}".format(release.state))
+    elif entity.state != 'active':
+        raise ValueError("Unhandled entity state: {}".format(entity.state))
 
     # First, the easy ones (direct copy)
+    release = entity
     t = dict(
         ident = release.ident,
         state = release.state,
@@ -57,11 +73,14 @@ def release_to_elasticsearch(release):
     )
 
     is_oa = None
+    is_preserved = None
     is_longtail_oa = None
     in_kbart = None
+    in_jstor = False
     in_web = False
     in_dweb = False
     in_ia = False
+    in_ia_sim = False
     in_shadow = False
 
     if release.release_date:
@@ -88,19 +107,35 @@ def release_to_elasticsearch(release):
         t['container_issnl'] = container.issnl
         t['container_type'] = container.container_type
         if container.extra:
-            if container.extra.get('is_oa') or container.extra.get('in_doaj'):
+            c_extra = container.extra
+            if c_extra.get('kbart') and release.year:
+                in_jstor = check_kbart(release.year, c_extra['kbart'].get('jstor'))
+                in_kbart = in_jstor
+                for archive in ('portico', 'lockss', 'clockss'):
+                    in_kbart = in_kbart or check_kbart(release.year, c_extra['kbart'].get(archive))
+
+            if c_extra.get('ia'):
+                if c_extra['ia'].get('sim') and release.year:
+                    in_ia_sim = check_kbart(release.year, c_extra['ia']['sim'])
+                if c_extra['ia'].get('longtail_oa'):
+                    is_longtail_oa = True
+            if c_extra.get('sherpa_romeo'):
+                if c_extra['sherpa_romeo'].get('color') == 'white':
+                    is_oa = False
+            if c_extra.get('default_license') and c_extra.get('default_license').startswith('CC-'):
                 is_oa = True
-            if container.extra.get('in_kbart'):
-                # TODO: better KBART check goes here
-                in_kbart = True
-            if container.extra.get('ia'):
-                # TODO: container longtail check goes here
-                # TODO: sim/microfilm check goes here
-                pass
-            # TODO: SHERPA/Romeo goes here
+            if c_extra.get('doaj'):
+                if c_extra['doaj'].get('as_of'):
+                    is_oa = True
+            if c_extra.get('road'):
+                if c_extra['road'].get('as_of'):
+                    is_oa = True
     else:
         t['publisher'] = release.publisher
 
+    if release.jstor_id or (release.doi and release.doi.startswith('10.2307/')):
+        in_jstor = True
+
     files = release.files or []
     t['file_count'] = len(files)
     t['fileset_count'] = len(release.filesets or [])
@@ -118,13 +153,15 @@ def release_to_elasticsearch(release):
             if url.url.lower().startswith('http'):
                 in_web = True
             if url.rel in ('dweb', 'p2p', 'ipfs', 'dat', 'torrent'):
-                # TODO: not sure what rel will be
+                # not sure what rel will be for this stuff
                 in_dweb = True
             if is_pdf:
                 any_pdf_url = url.url
             if is_pdf and url.rel in ('webarchive', 'repository') and is_pdf:
                 is_preserved = True
                 good_pdf_url = url.url
+            if '//www.jstor.org/' in url.url:
+                in_jstor = True
             if '//web.archive.org/' in url.url or '//archive.org/' in url.url:
                 in_ia = True
                 if is_pdf:
@@ -141,18 +178,15 @@ def release_to_elasticsearch(release):
 
     extra = release.extra or dict()
     if extra:
-        # TODO: longtail OA check from GROBID here
-        if extra.get('in_kbart'):
-            # NOTE: not actually setting this anywhere
-            in_kbart = True
         if extra.get('is_oa'):
-            # NOTE: not actually setting this anywhere
+            # NOTE: not actually setting this anywhere... but could
             is_oa = True
-        if extra.get('grobid'):
-            if not t.get('container_name'):
-                t['container_name'] = extra['grobid'].get('container_name')
-            if extra['grobid'].get('longtail_oa'):
-                is_longtail_oa = True
+        if extra.get('longtail_oa'):
+            # sometimes set by GROBID/matcher
+            is_oa = True
+            is_longtail_oa = True
+        if not t.get('container_name'):
+            t['container_name'] = extra.get('container_name')
         if extra.get('crossref'):
             if extra['crossref'].get('archive'):
                 # all crossref archives are KBART, I believe
@@ -163,8 +197,101 @@ def release_to_elasticsearch(release):
     t['is_oa'] = is_oa
     t['is_longtail_oa'] = is_longtail_oa
     t['in_kbart'] = in_kbart
+    t['in_jstor'] = in_jstor
     t['in_web'] = in_web
     t['in_dweb'] = in_dweb
-    t['in_ia'] = in_ia
-    t['is_preserved'] = in_ia or in_kbart
+    t['in_ia'] = bool(in_ia)
+    t['is_preserved'] = bool(is_preserved or in_ia or in_kbart or in_jstor)
+    return t
+
+def container_to_elasticsearch(entity):
+    """
+    Converts from an entity model/schema to elasticsearch oriented schema.
+
+    Returns: dict
+    Raises exception on error (never returns None)
+    """
+
+    if entity.state in ('redirect', 'deleted'):
+        return dict(
+            ident = entity.ident,
+            state = entity.state,
+        )
+    elif entity.state != 'active':
+        raise ValueError("Unhandled entity state: {}".format(entity.state))
+
+    # First, the easy ones (direct copy)
+    t = dict(
+        ident = entity.ident,
+        state = entity.state,
+        revision = entity.revision,
+
+        name = entity.name,
+        publisher = entity.publisher,
+        container_type = entity.container_type,
+        issnl = entity.issnl,
+        wikidata_qid = entity.wikidata_qid,
+    )
+
+    # TODO: region, discipline
+    # TODO: single primary language?
+    extra = entity.extra or dict()
+    for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
+        if extra.get(key):
+            t[key] = extra[key]
+
+    in_doaj = None
+    in_road = None
+    # TODO: not currently implemented
+    in_doi = None
+    # TODO: would be nice to have 'in_doaj_works', or maybe just "any_pid"
+    #in_doaj_works = None
+    in_sherpa_romeo = None
+    is_oa = None
+    # TODO: not actually set/stored anywhere?
+    is_longtail_oa = None
+    any_kbart = None
+    any_jstor = None
+    any_ia_sim = None
+
+    if extra.get('doaj'):
+        if extra['doaj'].get('as_of'):
+            in_doaj = True
+    if extra.get('road'):
+        if extra['road'].get('as_of'):
+            in_road = True
+    if extra.get('default_license'):
+        if extra['default_license'].startswith('CC-'):
+            is_oa = True
+    if extra.get('sherpa_romeo'):
+        in_sherpa_romeo = True
+        if extra['sherpa_romeo'].get('color') == 'white':
+            is_oa = False
+    if extra.get('kbart'):
+        any_kbart = True
+        if extra['kbart'].get('jstor'):
+            any_jstor = True
+    if extra.get('ia'):
+        if extra['ia'].get('sim'):
+            any_ia_sim = True
+
+    t['in_doaj'] = bool(in_doaj)
+    t['in_road'] = bool(in_road)
+    t['in_doi'] = in_doi
+    t['in_sherpa_romeo'] = in_sherpa_romeo
+    t['is_oa'] = bool(is_oa or in_doaj or in_road or is_longtail_oa)
+    t['is_longtail_oa'] = is_longtail_oa
+    t['any_kbart'] = bool(any_kbart)
+    t['any_jstor'] = bool(any_jstor)
+    t['any_ia_sim'] = bool(any_ia_sim)
     return t
diff --git a/python/tests/files/journal_extra_metadata.snip.csv b/python/tests/files/journal_extra_metadata.snip.csv
deleted file mode 100644
index 8cc50ee9..00000000
--- a/python/tests/files/journal_extra_metadata.snip.csv
+++ /dev/null
@@ -1,10 +0,0 @@
-ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count
-0000-0019,False,False,True,False,Publishers weekly,,http://www.publishersweekly.com/,,0000-0019,2150-4008,0.0,False,False,False,,http://www.publishersweekly.com/,301.0,200.0,https://www.publishersweekly.com/,3xx,200,publishersweekly.com,1055.0
-0001-0782,False,False,True,True,Communications of the ACM,Association for Computing Machinery,http://www.acm.org/pubs/cacm/,,0001-0782,1557-7317,11894.0,True,False,True,55.0,http://www.acm.org/pubs/cacm/,301.0,200.0,https://cacm.acm.org/,3xx,200,acm.org,9.0
-0001-1452,False,False,True,True,AIAA Journal,American Institute of Aeronautics and Astronautics,http://www.aiaa.org/content.cfm?pageid=322&lupubid=2,,0001-1452,1533-385X,24193.0,True,False,True,6.0,http://www.aiaa.org/content.cfm?pageid=322&lupubid=2,404.0,404.0,http://www.aiaa.org/content.cfm?pageid=322&lupubid=2,404,404,aiaa.org,16.0
-0001-1541,False,False,True,True,AIChE Journal,Wiley Blackwell (John Wiley & Sons),http://www.aiche.org/Publications/AIChEJournal/index.aspx,,0001-1541,1547-5905,15860.0,True,False,True,607.0,http://www.aiche.org/Publications/AIChEJournal/index.aspx,301.0,200.0,https://www.aiche.org/publications/journals/aiche-journal,3xx,200,aiche.org,25.0
-0001-2092,False,False,True,True,AORN Journal,Wiley Blackwell (John Wiley & Sons),http://www.aorn.org/AORNJournal/,,0001-2092,1878-0369,12413.0,True,False,True,607.0,http://www.aorn.org/AORNJournal/,301.0,200.0,https://www.aorn.org/aorn-journal,3xx,200,aorn.org,0.0
-0001-2343,False,False,True,True,Archiv fuer Rechts- und Sozialphilosphie,Franz Steiner Verlag GmbH,http://www.steiner-verlag.de/programm/zeitschriften/archiv-fuer-rechts-und-sozialphilosophie/,,0001-2343,2363-5614,14.0,True,False,False,2.0,http://www.steiner-verlag.de/programm/zeitschriften/archiv-fuer-rechts-und-sozialphilosophie/,200.0,200.0,http://www.steiner-verlag.de/programm/zeitschriften/archiv-fuer-rechts-und-sozialphilosophie/,200,200,steiner-verlag.de,226.0
-0001-2351,False,False,True,True,Transactions of the ASAE,American Society of Agricultural and Biological Engineers,,,0001-2351,2151-0059,11515.0,True,False,True,3.0,,,,,,,,0.0
-0001-2491,False,False,True,False,ASHRAE Journal,,http://www.ashrae.org/,,0001-2491,1943-6637,0.0,False,False,False,,http://www.ashrae.org/,301.0,200.0,https://www.ashrae.org/,3xx,200,ashrae.org,684.0
-0001-2505,False,False,True,False,ASHRAE Transactions,,http://www.ashrae.org/template/AssetDetail/assetid/25903,,0001-2505,,0.0,False,False,False,,http://www.ashrae.org/template/AssetDetail/assetid/25903,301.0,404.0,https://www.ashrae.org/template/AssetDetail/assetid/25903,3xx,404,ashrae.org,684.0
diff --git a/python/tests/files/journal_metadata.sample.json b/python/tests/files/journal_metadata.sample.json
new file mode 100644
index 00000000..56df2ea1
--- /dev/null
+++ b/python/tests/files/journal_metadata.sample.json
@@ -0,0 +1,20 @@
+{"publisher": "Peter Lang International Academic Publishers", "sherpa_romeo": {"color": "yellow"}, "name": "Monographs in Linguistics and the Philosophy of Language", "issnp": "1056-5019", "issnl": "1056-5019"}
+{"ia": {"homepage_url": "http://arjournals.annualreviews.org/loi/pharmtox", "homepage_status": -1}, "issnp": "0362-1642", "issne": "1545-4304", "urls": ["http://arjournals.annualreviews.org/loi/pharmtox"], "publisher": "Annual Reviews", "kbart": {"portico": {"year_spans": [[2011, 2017]]}}, "country": "us", "abbrev": "Annu. Rev. Pharmacol. Toxicol.", "crossref": {"any": true, "doi": "10.1146/pharmtox.711"}, "languages": ["en"], "issnl": "0362-1642", "name": "Annual Review of Pharmacology and Toxicology", "sherpa_romeo": {"color": "yellow"}, "norwegian": {"level": 1, "as_of": "2018-03-02", "id": 437839}, "sim": {"last_year": 2009, "id": "5091", "year_spans": [[1961, 2009]], "first_year": 1961}}
Toxicol.", "crossref": {"any": true, "doi": "10.1146/pharmtox.711"}, "languages": ["en"], "issnl": "0362-1642", "name": "Annual Review of Pharmacology and Toxicology", "sherpa_romeo": {"color": "yellow"}, "norwegian": {"level": 1, "as_of": "2018-03-02", "id": 437839}, "sim": {"last_year": 2009, "id": "5091", "year_spans": [[1961, 2009]], "first_year": 1961}} +{"name": "The international journal of applied radiation and isotopes", "issnp": "0020-708X", "issne": "1878-1284", "publisher": "Elsevier", "kbart": {"clockss": {"year_spans": [[1957, 1985]]}, "portico": {"year_spans": [[1956, 1985]]}}, "norwegian": {"level": 1, "as_of": "2018-03-02", "id": 447572}, "abbrev": "Int J Appl Radiat Isot", "crossref": {"any": true}, "issnl": "0020-708X"} +{"issnp": "2449-8920", "issne": "2449-8939", "road": {"as_of": "2018-01-24"}, "languages": ["en"], "publisher": "Uniwersytet Jagiellonski \u2013 Wydawnictwo Uniwersytetu Jagiellonskiego", "country": "pl", "urls": ["http://www.ejournals.eu/ijcm/", "http://www.ejournals.eu/ijcm"], "crossref": {"any": true}, "issnl": "2449-8920", "name": "International Journal of Contemporary Management", "norwegian": {"level": 1, "as_of": "2018-03-02", "id": 488900}, "ia": {"homepage_url": "http://www.ejournals.eu/ijcm", "homepage_status": 200}} +{"issnp": "1976-1317", "default_license": "CC-BY-NC-ND", "road": {"as_of": "2018-01-24"}, "doaj": {"as_of": "2019-01-24", "seal": false, "archive": ["national-library"]}, "languages": ["en"], "publisher": "Elsevier", "kbart": {"clockss": {"year_spans": [[2007, 2018]]}, "portico": {"year_spans": [[2007, 2018]]}}, "country": "kr", "urls": ["http://www.journals.elsevier.com/asian-nursing-research", "http://www.asian-nursingresearch.com"], "crossref": {"any": true}, "issnl": "1976-1317", "abbrev": "Asian Nurs Res (Korean Soc Nurs Sci)", "name": "Asian Nursing Research", "sherpa_romeo": {"color": "green"}, "ia": {"homepage_url": "https://www.journals.elsevier.com/asian-nursing-research", "homepage_status": 200}, "issne": "2093-7482"} +{"name": "Research on Managing Groups and Teams", "issnp": "1534-0856", "languages": ["en"], "ia": {"homepage_url": "http://www.emeraldinsight.com/series/rmgt", "homepage_status": -1}, "country": "us", "urls": ["http://www.emeraldinsight.com/series/rmgt"], "norwegian": {"level": 1, "as_of": "2018-03-02", "id": 484721}, "issnl": "1534-0856"} +{"name": "Asian Journal of Medical Sciences", "issne": "2091-0576", "road": {"as_of": "2018-01-24"}, "languages": ["en"], "publisher": "Manipal Colleges of Medical Sciences", "ia": {"homepage_url": "https://www.nepjol.info/index.php/AJMS", "homepage_status": 200}, "country": "np", "urls": ["http://www.nepjol.info/index.php/AJMS/about", "http://www.nepjol.info/index.php/AJMS"], "norwegian": {"level": 1, "as_of": "2018-03-02", "id": 478318}, "issnl": "2091-0576"} +{"name": "ATZproduktion", "issnp": "1865-4908", "issne": "2192-8886", "publisher": "Springer Fachmedien Wiesbaden GmbH", "crossref": {"any": true}, "issnl": "1865-4908"} +{"publisher": "Boom Uitgevers Den Haag", "name": "StAB", "issnp": "1573-806X", "crossref": {"any": true}, "issnl": "1573-806X"} +{"publisher": "University of Victoria", "sherpa_romeo": {"color": "blue"}, "name": "Musicological Explorations", "issnp": "1711-9235", "issnl": "1711-9235"} +{"publisher": "Transstellar Journal Publications and Research Consultancy Private Limited", "name": "International Journal of Robotics Research and Development", "issnp": "2250-1592", "crossref": {"any": true}, "issnl": "2250-1592"} +{"name": "BioProcess 
international", "issnp": "1542-6319", "abbrev": "Bioprocess Int", "issne": "1945-7065", "issnl": "1542-6319"} +{"name": "Journal of the American Podiatry Association", "issnp": "0003-0538", "abbrev": "J Am Podiatry Assoc", "issnl": "0003-0538"} +{"publisher": "Masson Editeur", "name": "Bulletin de la Societe francaise de dermatologie et de syphiligraphie", "languages": ["en"], "sim": {"last_year": 1972, "id": "3407", "year_spans": [[1890, 1941], [1968, 1972]], "gaps": [1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967], "first_year": 1890}, "issnl": "0049-1071"} +{"name": "International journal of innovative and applied research", "urls": ["http://www.journalijiar.com"], "ia": {"homepage_url": "http://www.journalijiar.com/", "homepage_status": 200}, "road": {"as_of": "2018-01-24"}, "issnl": "2348-0319"} +{"issnp": "1025-6415", "issne": "1025-6415", "road": {"as_of": "2018-01-24"}, "languages": ["ru"], "publisher": "Natsional'na Akademiya Nauk Ukrainy", "abbrev": "Dopov Natsionalnoi Akad Nauk Ukr Matemat Pryrodoznavstvo Tekhnichni Nauky", "country": "ua", "urls": ["http://www.nbuv.gov.ua/portal/all/reports/index.html", "http://dopovidi-nanu.org.ua/"], "crossref": {"any": true, "doi": "10.15407/dopovidi"}, "issnl": "1025-6415", "name": "Natsional'na Akademiya Nauk Ukrainy. Dopovidi: naukovyi zhurnal", "norwegian": {"level": 1, "as_of": "2018-03-02", "id": 477246}, "ia": {"homepage_url": "http://dopovidi-nanu.org.ua/", "homepage_status": 200}} +{"publisher": "International Bee Research Association", "name": "Journal of ApiProduct and ApiMedical Science", "crossref": {"any": true, "doi": "10.3896/IBRA.4"}, "issne": "1759-7986", "issnl": "1759-7986"} +{"publisher": "E3 Journals Ltd", "name": "E3 Journal of Agricultural Research and Development", "crossref": {"any": true, "doi": "10.18685/EJARD"}, "issne": "2276-9897", "issnl": "2276-9897"} +{"name": "Tung wu hsueh tsa chih : Dongwuxue zazhi", "issnp": "0250-3263", "abbrev": "Dongwuxue Zazhi", "issnl": "0250-3263"} +{"name": "Buca e\u011fitim fak\u00fcltesi dergisi (Online)", "urls": ["http://www.befjournal.com/index.php/dergi/index"], "ia": {"homepage_url": "http://www.befjournal.com/cgi-sys/suspendedpage.cgi", "homepage_status": 200}, "road": {"as_of": "2018-01-24"}, "issnl": "1302-5147"} diff --git a/python/tests/import_journal_metadata.py b/python/tests/import_journal_metadata.py index a2b10a65..cfeee517 100644 --- a/python/tests/import_journal_metadata.py +++ b/python/tests/import_journal_metadata.py @@ -1,6 +1,6 @@  import pytest -from fatcat_tools.importers import JournalMetadataImporter, CsvPusher +from fatcat_tools.importers import JournalMetadataImporter, JsonLinePusher  from fixtures import api @@ -10,15 +10,15 @@ def journal_metadata_importer(api):  # TODO: use API to check that entities actually created...  
 def test_journal_metadata_importer_batch(journal_metadata_importer):
-    with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f:
-        CsvPusher(journal_metadata_importer, f).run()
+    with open('tests/files/journal_metadata.sample.json', 'r') as f:
+        JsonLinePusher(journal_metadata_importer, f).run()
 
 def test_journal_metadata_importer(journal_metadata_importer):
     last_index = journal_metadata_importer.api.get_changelog(limit=1)[0].index
-    with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f:
+    with open('tests/files/journal_metadata.sample.json', 'r') as f:
         journal_metadata_importer.bezerk_mode = True
-        counts = CsvPusher(journal_metadata_importer, f).run()
-    assert counts['insert'] == 9
+        counts = JsonLinePusher(journal_metadata_importer, f).run()
+    assert counts['insert'] == 20
     assert counts['exists'] == 0
     assert counts['skip'] == 0
@@ -30,10 +30,10 @@ def test_journal_metadata_importer(journal_metadata_importer):
     assert eg.extra['git_rev']
     assert "fatcat_tools.JournalMetadataImporter" in eg.extra['agent']
 
-    with open('tests/files/journal_extra_metadata.snip.csv', 'r') as f:
+    with open('tests/files/journal_metadata.sample.json', 'r') as f:
         journal_metadata_importer.reset()
         journal_metadata_importer.bezerk_mode = False
-        counts = CsvPusher(journal_metadata_importer, f).run()
+        counts = JsonLinePusher(journal_metadata_importer, f).run()
     assert counts['insert'] == 0
-    assert counts['exists'] == 9
+    assert counts['exists'] == 20
     assert counts['skip'] == 0
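For reference, a hedged usage sketch mirroring the updated tests above (it assumes `api` is a configured fatcat API client, as provided by the `fixtures.api` test fixture; it is not part of the commit itself):

    from fatcat_tools.importers import JournalMetadataImporter, JsonLinePusher

    importer = JournalMetadataImporter(api)
    with open('tests/files/journal_metadata.sample.json', 'r') as f:
        counts = JsonLinePusher(importer, f).run()

    # Against an empty database the first run inserts all 20 sample journals;
    # re-running the same file reports counts['exists'] == 20 and no new edits.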
