 extra/elasticsearch/README.md                     |  16
 extra/elasticsearch/changelog_schema.json         |   5
 extra/elasticsearch/fatcat_schema.json            | 109
 python/fatcat_export.py                           |  45
 python/fatcat_tools/__init__.py                   |   4
 python/fatcat_tools/importers/common.py           |   4
 python/fatcat_tools/importers/journal_metadata.py |  10
 python/fatcat_tools/transforms.py                 |  51
 8 files changed, 214 insertions(+), 30 deletions(-)
diff --git a/extra/elasticsearch/README.md b/extra/elasticsearch/README.md
index 761ad6ab..691c6ed5 100644
--- a/extra/elasticsearch/README.md
+++ b/extra/elasticsearch/README.md
@@ -38,13 +38,17 @@ There is a Dockerfile in this directory which includes this installation.
 
 Drop and rebuild the schema:
 
-    http delete :9200/fatcat
-    http put :9200/fatcat < release_schema.json
+    http delete :9200/fatcat_release
+    http delete :9200/fatcat_container
+    http delete :9200/fatcat_changelog
+    http put :9200/fatcat_release < release_schema.json
+    http put :9200/fatcat_container < container_schema.json
+    http put :9200/fatcat_changelog < changelog_schema.json
 
 Put a single object (good for debugging):
 
-    head -n1 examples.json | http post :9200/fatcat/release/0
-    http get :9200/fatcat/release/0
+    head -n1 examples.json | http post :9200/fatcat_release/release/0
+    http get :9200/fatcat_release/release/0
 
 Bulk insert from a file on disk:
 
@@ -53,14 +57,14 @@ Bulk insert from a file on disk:
 Or, in a bulk production live-stream conversion:
 
     export LC_ALL=C.UTF-8
-    time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./fatcat_export.py transform-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat -type release
+    time zcat /srv/fatcat/snapshots/fatcat_release_dump_expanded.json.gz | ./fatcat_export.py transform-releases - - | esbulk -verbose -size 20000 -id ident -w 8 -index fatcat_release -type release
 
 ## Full-Text Querying
 
 A generic full-text "query string" query look like this (replace "blood" with
 actual query string, and "size" field with the max results to return):
 
-    GET /fatcat/release/_search
+    GET /fatcat_release/release/_search
     {
       "query": {
         "query_string": {
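For reference, the renamed indexes can also be exercised from Python rather than httpie. A minimal sketch, assuming an unauthenticated Elasticsearch node on localhost:9200 and an already-populated `fatcat_release` index; the `requests` usage and the query parameters beyond `query` are illustrative, not part of this commit:

```python
import requests

# Run the README's "query string" search against the renamed index.
query = {
    "query": {
        "query_string": {
            "query": "blood",           # replace with the actual query string
            "default_operator": "AND",  # assumed setting, not shown in the diff
        }
    },
    "size": 10,  # max results to return
}
resp = requests.get(
    "http://localhost:9200/fatcat_release/release/_search", json=query)
resp.raise_for_status()
for hit in resp.json()["hits"]["hits"]:
    print(hit["_source"]["title"])
```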
"icu_folding" ] +                } +            } +        } +    } +}, +"mappings": { +    "release": { +        "properties": { +            "ident":          { "type": "keyword" }, +            "state":          { "type": "keyword" }, +            "revision":       { "type": "keyword" }, +            "title":          { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, +            "original_title": { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, +            "release_date":   { "type": "date" }, +            "release_year":   { "type": "integer" }, +            "release_type":   { "type": "keyword" }, +            "release_status": { "type": "keyword" }, +            "language":       { "type": "keyword" }, +            "doi":            { "type": "keyword" }, +            "pmid":           { "type": "keyword" }, +            "pmcid":          { "type": "keyword" }, +            "isbn13":         { "type": "keyword" }, +            "wikidata_qid":   { "type": "keyword" }, +            "core_id":        { "type": "keyword" }, +            "axiv_id":        { "type": "keyword" }, +            "jstor_id":       { "type": "keyword" }, +            "license":        { "type": "keyword" }, +            "publisher":                { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, +            "container_name":           { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, +            "container_issnl":          { "type": "keyword" }, +            "container_type":           { "type": "keyword" }, +            "contrib_count":        { "type": "integer" }, +            "contrib_names":        { "type": "text", "index": true, "analyzer": "textIcu", "search_analyzer":"textIcuSearch" }, +            "ref_count":            { "type": "integer" }, +            "file_count":           { "type": "integer" }, +            "fileset_count":        { "type": "integer" }, +            "webcapture_count":     { "type": "integer" }, +            "any_abstract":         { "type": "boolean" }, + +            "best_pdf_url":         { "type": "keyword" }, +            "ia_pdf_url":           { "type": "keyword" }, +            "is_oa":                { "type": "boolean" }, +            "is_longtail_oa":       { "type": "boolean" }, +            "is_preserved":         { "type": "boolean" }, +            "in_kbart":             { "type": "boolean" }, +            "in_jstor":             { "type": "boolean" }, +            "in_dweb":              { "type": "boolean" }, +            "in_web":               { "type": "boolean" }, +            "in_ia":                { "type": "boolean" }, +            "in_ia_sim":            { "type": "boolean" }, +            "in_shadows":           { "type": "boolean" }, + +            "author":         { "type": "alias", "path": "contrib_names" }, +            "journal":        { "type": "alias", "path": "container_name" }, +            "date":           { "type": "alias", "path": "release_date" }, +            "year":           { "type": "alias", "path": "release_year" }, +            "issn":           { "type": "alias", "path": "container_issnl" }, +            "oa":             { "type": "alias", "path": "is_oa" }, +            "longtail":       { "type": "alias", "path": "is_longtail_oa" }, +            "lang":           { "type": "alias", "path": "language" }, +            "file_pdf_url":   { "type": 
"alias", "path": "best_pdf_url" }, +            "is_kept":        { "type": "alias", "path": "in_kbart" } +        } +    }, +    "changelog": { +        "properties": { +            "index":            { "type": "integer" }, +            "editgorup_id":     { "type": "keyword" }, +            "timestamp":        { "type": "date" }, +            "username":         { "type": "keyword" }, +            "is_bot":           { "type": "boolean" }, +            "is_admin":         { "type": "boolean" }, +            "agent":            { "type": "keyword" }, +            "containers":       { "type": "integer" }, +            "creators":         { "type": "integer" }, +            "files":            { "type": "integer" }, +            "filessets":        { "type": "integer" }, +            "webcaptures":      { "type": "integer" }, +            "releases":         { "type": "integer" }, +            "works":            { "type": "integer" }, +            "created":          { "type": "integer" }, +            "updated":          { "type": "integer" }, +            "deleted":          { "type": "integer" }, +            "total":            { "type": "integer" } +        } +    } +} +} diff --git a/python/fatcat_export.py b/python/fatcat_export.py index cf8bf1c3..027d6c0a 100755 --- a/python/fatcat_export.py +++ b/python/fatcat_export.py @@ -12,9 +12,10 @@ import json  import argparse  import fatcat_client  from fatcat_client.rest import ApiException -from fatcat_client import ReleaseEntity +from fatcat_client import ReleaseEntity, ContainerEntity, ChangelogEntry  from fatcat_tools import uuid2fcid, entity_from_json, entity_to_dict, \ -    release_to_elasticsearch, public_api +    release_to_elasticsearch, container_to_elasticsearch, \ +    changelog_to_elasticsearch, public_api  def run_export_releases(args): @@ -30,9 +31,27 @@ def run_transform_releases(args):          line = line.strip()          if not line:              continue -        release = entity_from_json(line, ReleaseEntity) +        entity = entity_from_json(line, ReleaseEntity)          args.json_output.write( -            json.dumps(release_to_elasticsearch(release)) + '\n') +            json.dumps(release_to_elasticsearch(entity)) + '\n') + +def run_transform_containers(args): +    for line in args.json_input: +        line = line.strip() +        if not line: +            continue +        entity = entity_from_json(line, ContainerEntity) +        args.json_output.write( +            json.dumps(container_to_elasticsearch(entity)) + '\n') + +def run_transform_changelogs(args): +    for line in args.json_input: +        line = line.strip() +        if not line: +            continue +        entity = entity_from_json(line, ChangelogEntry) +        args.json_output.write( +            json.dumps(changelog_to_elasticsearch(entity)) + '\n')  def run_export_changelog(args):      api = args.api @@ -74,6 +93,24 @@ def main():          help="where to send output",          default=sys.stdout, type=argparse.FileType('w')) +    sub_transform_containers = subparsers.add_parser('transform-containers') +    sub_transform_containers.set_defaults(func=run_transform_containers) +    sub_transform_containers.add_argument('json_input', +        help="JSON-per-line of container entities", +        default=sys.stdin, type=argparse.FileType('r')) +    sub_transform_containers.add_argument('json_output', +        help="where to send output", +        default=sys.stdout, type=argparse.FileType('w')) + +    sub_transform_changelogs = 
diff --git a/python/fatcat_tools/__init__.py b/python/fatcat_tools/__init__.py
index e2b1e3a2..64c45062 100644
--- a/python/fatcat_tools/__init__.py
+++ b/python/fatcat_tools/__init__.py
@@ -1,4 +1,6 @@
 
 from .api_auth import authenticated_api, public_api
 from .fcid import fcid2uuid, uuid2fcid
-from .transforms import entity_to_dict, entity_from_json, release_to_elasticsearch
+from .transforms import entity_to_dict, entity_from_json, \
+    release_to_elasticsearch, container_to_elasticsearch, \
+    changelog_to_elasticsearch
diff --git a/python/fatcat_tools/importers/common.py b/python/fatcat_tools/importers/common.py
index ebdce56f..a29b3019 100644
--- a/python/fatcat_tools/importers/common.py
+++ b/python/fatcat_tools/importers/common.py
@@ -236,8 +236,8 @@ class EntityImporter:
         self._entity_queue.append(entity)
         if len(self._entity_queue) >= self.edit_batch_size:
             self.insert_batch(self._entity_queue)
-            self.counts['insert'] += len(_entity_queue)
-            self._entity_queue = 0
+            self.counts['insert'] += len(self._entity_queue)
+            self._entity_queue = []
 
     def want(self, raw_record):
         """
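The common.py fix corrects two bugs in the batch flush: the insert counter referenced `_entity_queue` without `self.` (a NameError at flush time), and the queue was reset to the integer `0` rather than an empty list, which would crash the next `append()`. The accumulate-and-flush pattern, isolated as a self-contained sketch (`BatchQueue` is illustrative, not a fatcat class):

```python
class BatchQueue:
    """Accumulate entities and flush them in fixed-size batches."""

    def __init__(self, batch_size, insert_batch):
        self.batch_size = batch_size
        self.insert_batch = insert_batch  # callable taking a list of entities
        self._queue = []
        self.inserted = 0

    def push(self, entity):
        self._queue.append(entity)
        if len(self._queue) >= self.batch_size:
            self.insert_batch(self._queue)
            self.inserted += len(self._queue)  # count *before* clearing
            self._queue = []                   # reset to a list, never 0

    def finish(self):
        # flush whatever remains at end-of-stream
        if self._queue:
            self.insert_batch(self._queue)
            self.inserted += len(self._queue)
            self._queue = []
```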
diff --git a/python/fatcat_tools/importers/journal_metadata.py b/python/fatcat_tools/importers/journal_metadata.py
index 7f6b1ee8..be62d63a 100644
--- a/python/fatcat_tools/importers/journal_metadata.py
+++ b/python/fatcat_tools/importers/journal_metadata.py
@@ -44,7 +44,7 @@ class JournalMetadataImporter(EntityImporter):
             editgroup_extra=eg_extra)
 
     def want(self, raw_record):
-        if raw_record.get('issnl'):
+        if raw_record.get('issnl') and raw_record.get('name'):
             return True
         return False
 
@@ -55,6 +55,10 @@ class JournalMetadataImporter(EntityImporter):
         returns a ContainerEntity (or None if invalid or couldn't parse)
         """
 
+        if not row.get('name'):
+            # Name is required (by schema)
+            return None
+
         extra = dict()
         for key in ('issne', 'issnp', 'languages', 'country', 'urls', 'abbrev',
             'coden', 'aliases', 'original_name', 'first_year', 'last_year',
@@ -76,8 +80,10 @@ class JournalMetadataImporter(EntityImporter):
         extra_ia = dict()
         # TODO: would like an ia.longtail_ia flag
         if row.get('sim'):
+            # NB: None case of the .get() here is blech, but otherwise
+            # extra['ia'].get('sim') would be false-y, breaking 'any_ia_sim' later on
             extra_ia['sim'] = {
-                'year_spans': row['sim']['year_spans'],
+                'year_spans': row['sim'].get('year_spans'),
             }
         if extra_ia:
             extra['ia'] = extra_ia
diff --git a/python/fatcat_tools/transforms.py b/python/fatcat_tools/transforms.py
index a85c877c..7bb75c3e 100644
--- a/python/fatcat_tools/transforms.py
+++ b/python/fatcat_tools/transforms.py
@@ -231,20 +231,12 @@ def container_to_elasticsearch(entity):
         container_type = entity.container_type,
         issnl = entity.issnl,
         wikidata_qid = entity.wikidata_qid,
-
-        entity_status = entity.entity_status,
-        language = entity.language,
-        license = entity.license_slug,
-        doi = entity.doi,
-        pmid = entity.pmid,
-        isbn13 = entity.isbn13,
-        core_id = entity.core_id,
-        arxiv_id = entity.core_id,
-        jstor_id = entity.jstor_id,
     )
 
     # TODO: region, discipline
     # TODO: single primary language?
+    if not entity.extra:
+        entity.extra = dict()
     for key in ('country', 'languages', 'mimetypes', 'first_year', 'last_year'):
         if entity.extra.get(key):
             t[key] = entity.extra[key]
@@ -285,13 +277,46 @@ def container_to_elasticsearch(entity):
         if extra['ia'].get('sim'):
             any_ia_sim = True
 
-    t['in_doaj'] = is_doaj
-    t['in_road'] = is_road
+    t['in_doaj'] = in_doaj
+    t['in_road'] = in_road
     t['in_doi'] = in_doi
     t['in_sherpa_romeo'] = in_sherpa_romeo
-    t['is_oa'] = in_doaj or in_road or is_longtail_oa or ia_oa
+    t['is_oa'] = in_doaj or in_road or is_longtail_oa or is_oa
     t['is_longtail_oa'] = is_longtail_oa
     t['any_kbart'] = any_ia_sim
     t['any_jstor'] = any_ia_sim
     t['any_ia_sim'] = bool(any_ia_sim)
     return t
+
+
+def changelog_to_elasticsearch(entity):
+
+    editgroup = entity.editgroup
+    t = dict(
+        index=entity.index,
+        editgroup_id=entity.editgroup_id,
+        timestamp=entity.timestamp,
+        editor_id=editgroup.editor_id,
+    )
+
+    extra = editgroup.extra or dict()
+    if extra.get('agent'):
+        t['agent'] = extra['agent']
+
+    t['containers'] = len(editgroup.edits.containers)
+    t['creators'] = len(editgroup.edits.creators)
+    t['files'] = len(editgroup.edits.files)
+    t['filesets'] = len(editgroup.edits.filesets)
+    t['webcaptures'] = len(editgroup.edits.webcaptures)
+    t['releases'] = len(editgroup.edits.releases)
+    t['works'] = len(editgroup.edits.works)
+
+    # TODO: parse and pull out counts
+    #created = 0
+    #updated = 0
+    #deleted = 0
+    #t['created'] = created
+    #t['updated'] = updated
+    #t['deleted'] = deleted
+    #t['total'] = created + updated + deleted
+    return t
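One way the remaining TODO in `changelog_to_elasticsearch()` could be filled in, hedged heavily: this sketch assumes `EntityEdit` semantics where a set `revision` with no `prev_revision` is a create, an unset `revision` is a delete, and anything else is an update — verify against the `fatcat_client` models before adopting it:

```python
def count_edit_types(editgroup):
    # Hypothetical helper for the "parse and pull out counts" TODO above.
    created = updated = deleted = 0
    edit_lists = (
        editgroup.edits.containers, editgroup.edits.creators,
        editgroup.edits.files, editgroup.edits.filesets,
        editgroup.edits.webcaptures, editgroup.edits.releases,
        editgroup.edits.works,
    )
    for edits in edit_lists:
        for edit in edits:
            if edit.revision and not edit.prev_revision:
                created += 1   # new entity revision with no predecessor
            elif not edit.revision:
                deleted += 1   # edit that removes the entity's revision
            else:
                updated += 1
    return dict(created=created, updated=updated, deleted=deleted,
                total=created + updated + deleted)
```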
