diff options
| author | Bryan Newbold <bnewbold@robocracy.org> | 2021-04-06 20:04:03 -0700 | 
|---|---|---|
| committer | Bryan Newbold <bnewbold@robocracy.org> | 2021-04-06 21:58:54 -0700 | 
| commit | 2e781738937efecbfc527a47ade6c3deaba64247 (patch) | |
| tree | 52278319ae6fe1fafe18ace92959b01bb32e82c4 | |
| parent | 61bd2d65fd1c4fbda2c28d36c5388a610b4d1d14 (diff) | |
| download | fatcat-2e781738937efecbfc527a47ade6c3deaba64247.tar.gz fatcat-2e781738937efecbfc527a47ade6c3deaba64247.zip  | |
container search schema: preservation stats, new fields
Includes transform code updates and partial test coverage.
| -rw-r--r-- | extra/elasticsearch/container_schema.json | 17 | ||||
| -rw-r--r-- | python/fatcat_tools/transforms/elasticsearch.py | 20 | ||||
| -rw-r--r-- | python/tests/transform_elasticsearch.py | 47 | 
3 files changed, 69 insertions, 15 deletions
diff --git a/extra/elasticsearch/container_schema.json b/extra/elasticsearch/container_schema.json index 21b8d4ec..9673e9e3 100644 --- a/extra/elasticsearch/container_schema.json +++ b/extra/elasticsearch/container_schema.json @@ -55,6 +55,7 @@              "issnl":          { "type": "keyword", "normalizer": "default" },              "issns":          { "type": "keyword", "normalizer": "default" },              "wikidata_qid":   { "type": "keyword", "normalizer": "default" }, +            "dblp_prefix":    { "type": "keyword", "normalizer": "default" },              "country_code":   { "type": "keyword", "normalizer": "default" },              "region":         { "type": "keyword", "normalizer": "default" },              "discipline":     { "type": "keyword", "normalizer": "default" }, @@ -74,19 +75,19 @@              "any_jstor":            { "type": "boolean" },              "any_ia_sim":           { "type": "boolean" },              "sherpa_romeo_color":   { "type": "keyword", "normalizer": "default" }, +            "keepers":              { "type": "keyword", "normalizer": "default" }, -            "releases_total": { "type": "integer" }, -            "releases_kbart": { "type": "integer" }, -            "releases_ia":    { "type": "integer" }, -            "releases_ia_sim":          { "type": "integer" }, -            "releases_shadows":         { "type": "integer" }, -            "releases_any_file":        { "type": "integer" }, -            "releases_any_fileset":     { "type": "integer" }, -            "releases_any_webcapture":  { "type": "integer" }, +            "releases_total":           { "type": "integer" }, +            "preservation_bright":      { "type": "integer" }, +            "preservation_dark":        { "type": "integer" }, +            "preservation_shadows_only":{ "type": "integer" }, +            "preservation_none":        { "type": "integer" },              "year":           { "type": "alias", "path": "first_year" },              "type":           { "type": "alias", "path": "container_type" },              "issn":           { "type": "alias", "path": "issns" }, +            "release_count":  { "type": "alias", "path": "releases_total" }, +            "releases_count": { "type": "alias", "path": "releases_total" },              "oa":             { "type": "alias", "path": "is_oa" },              "longtail":       { "type": "alias", "path": "is_longtail_oa" }          } diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py index 5058989c..fe463fa4 100644 --- a/python/fatcat_tools/transforms/elasticsearch.py +++ b/python/fatcat_tools/transforms/elasticsearch.py @@ -377,7 +377,7 @@ def _rte_url_helper(url_obj) -> dict:      return t -def container_to_elasticsearch(entity, force_bool=True): +def container_to_elasticsearch(entity, force_bool=True, stats=None):      """      Converts from an entity model/schema to elasticsearch oriented schema. @@ -411,10 +411,13 @@ def container_to_elasticsearch(entity, force_bool=True):          entity.extra = dict()      for key in ('country', 'languages', 'mimetypes', 'original_name',                  'first_year', 'last_year', 'aliases', 'abbrev', 'region', -                'discipline'): +                'discipline', 'publisher_type'):          if entity.extra.get(key):              t[key] = entity.extra[key] +    if entity.extra.get('dblp') and entity.extra['dblp'].get('prefix'): +        t['dblp_prefix'] = entity.extra['dblp']['prefix'] +      if 'country' in t:          t['country_code'] = t.pop('country') @@ -432,6 +435,7 @@ def container_to_elasticsearch(entity, force_bool=True):      any_kbart = None      any_jstor = None      any_ia_sim = None +    keepers = []      extra = entity.extra      if extra.get('doaj'): @@ -455,6 +459,9 @@ def container_to_elasticsearch(entity, force_bool=True):          any_kbart = True          if extra['kbart'].get('jstor'):              any_jstor = True +        for k, v in extra['kbart'].items(): +            if v and isinstance(v, dict): +                keepers.append(k)      if extra.get('ia'):          if extra['ia'].get('sim'):              any_ia_sim = True @@ -462,6 +469,7 @@ def container_to_elasticsearch(entity, force_bool=True):              is_longtail_oa = True      t['is_superceded'] = bool(extra.get('superceded')) +    t['keepers'] = keepers      t['in_doaj'] = bool(in_doaj)      t['in_road'] = bool(in_road)      t['any_kbart'] = bool(any_kbart) @@ -475,6 +483,14 @@ def container_to_elasticsearch(entity, force_bool=True):          t['is_longtail_oa'] = is_longtail_oa          t['any_jstor'] = any_jstor          t['any_ia_sim'] = any_ia_sim + +    # mix in stats, if provided +    if stats: +        t['releases_total'] = stats['total'] +        t['preservation_bright'] = stats['preservation']['bright'] +        t['preservation_dark'] = stats['preservation']['dark'] +        t['preservation_shadows_only'] = stats['preservation']['shadows_only'] +        t['preservation_none'] = stats['preservation']['none']      return t diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py index 9cf77d4a..ba2b7ea2 100644 --- a/python/tests/transform_elasticsearch.py +++ b/python/tests/transform_elasticsearch.py @@ -147,11 +147,48 @@ def test_elasticsearch_release_from_json():  def test_elasticsearch_container_transform(journal_metadata_importer):      with open('tests/files/journal_metadata.sample.json', 'r') as f: -        raw = json.loads(f.readline()) -        c = journal_metadata_importer.parse_record(raw) -    c.state = 'active' -    es = container_to_elasticsearch(c) -    assert es['publisher'] == c.publisher +        raw1 = json.loads(f.readline()) +        raw2 = json.loads(f.readline()) +        c1 = journal_metadata_importer.parse_record(raw1) +        c1.state = 'active' +        c2 = journal_metadata_importer.parse_record(raw2) +        c2.state = 'active' + +    c1.extra['publisher_type'] = "big5" +    c1.extra['discipline'] = "history" +    es = container_to_elasticsearch(c1) +    assert es['publisher'] == c1.publisher +    assert es['discipline'] == c1.extra['discipline'] +    assert es['publisher_type'] == c1.extra['publisher_type'] +    assert es['keepers'] == [] + +    stats = { +        "ident": "en4qj5ijrbf5djxx7p5zzpjyoq", +        "in_kbart": 11136, +        "in_web": 9501, +        "is_preserved": 11136, +        "issnl": "2050-084X", +        "preservation": { +            "bright": 9501, +            "dark": 1635, +            "none": 0, +            "shadows_only": 0, +            "total": 11136 +        }, +        "release_type": { +            "_unknown": 9, +            "article-journal": 11124, +            "editorial": 2, +            "letter": 1 +        }, +        "total": 11136 +    } +    es = container_to_elasticsearch(c2, stats=stats) +    assert es['name'] == c2.name +    assert es['publisher'] == c2.publisher +    assert es['keepers'] == list(c2.extra['kbart'].keys()) == ["portico"] +    assert es['any_kbart'] == True +  def test_elasticsearch_file_transform(matched_importer):      f = entity_from_json(open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r').read(), FileEntity)  | 
