aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorBryan Newbold <bnewbold@robocracy.org>2020-01-29 16:00:03 -0800
committerBryan Newbold <bnewbold@robocracy.org>2020-01-29 21:59:05 -0800
commit901cf998ce7d8f896cf5d609719b1defd96d01d4 (patch)
tree85a5eebe76e8ca4df3b1b4217ecc9e829f50583c
parent55a4f211532c93d8164b0d4719dc0413005941ea (diff)
downloadfatcat-901cf998ce7d8f896cf5d609719b1defd96d01d4.tar.gz
fatcat-901cf998ce7d8f896cf5d609719b1defd96d01d4.zip
first implementation of ES file schema
Includes a trivial test and transform, but not any workers or doc updates.
-rw-r--r--extra/elasticsearch/file_schema.json46
-rw-r--r--python/fatcat_tools/transforms/__init__.py2
-rw-r--r--python/fatcat_tools/transforms/elasticsearch.py45
-rw-r--r--python/tests/transform_tests.py25
4 files changed, 115 insertions, 3 deletions
diff --git a/extra/elasticsearch/file_schema.json b/extra/elasticsearch/file_schema.json
new file mode 100644
index 00000000..66d81e0b
--- /dev/null
+++ b/extra/elasticsearch/file_schema.json
@@ -0,0 +1,46 @@
+{
+"settings": {
+ "index": {
+ "analysis": {
+ "analyzer": {
+ "default": {
+ "type": "custom",
+ "tokenizer": "standard",
+ "filter": [ "lowercase", "asciifolding" ]
+ }
+ }
+ }
+ }
+},
+"mappings": {
+  "file": {
+ "properties": {
+ "ident": { "type": "keyword", "doc_values": false },
+ "state": { "type": "keyword" },
+ "revision": { "type": "keyword", "doc_values": false },
+
+ "release_ids": { "type": "keyword", "doc_values": false },
+ "release_count": { "type": "integer" },
+ "mimetype": { "type": "keyword" },
+ "size_bytes": { "type": "integer" },
+ "sha1": { "type": "keyword", "doc_values": false },
+ "sha256": { "type": "keyword", "doc_values": false },
+ "md5": { "type": "keyword", "doc_values": false },
+
+ "domains": { "type": "keyword" },
+ "hosts": { "type": "keyword" },
+ "rels": { "type": "keyword" },
+ "in_ia": { "type": "boolean" },
+
+ "release_id": { "type": "alias", "path": "release_ids" },
+      "sha1hex": { "type": "alias", "path": "sha1" },
+      "sha256hex": { "type": "alias", "path": "sha256" },
+      "md5hex": { "type": "alias", "path": "md5" },
+ "size": { "type": "alias", "path": "size_bytes" },
+ "domain": { "type": "alias", "path": "domains" },
+      "host": { "type": "alias", "path": "hosts" },
+      "rel": { "type": "alias", "path": "rels" }
+ }
+ }
+}
+}
diff --git a/python/fatcat_tools/transforms/__init__.py b/python/fatcat_tools/transforms/__init__.py
index 6a4b1bba..3f4700ff 100644
--- a/python/fatcat_tools/transforms/__init__.py
+++ b/python/fatcat_tools/transforms/__init__.py
@@ -1,5 +1,5 @@
from .entities import entity_to_dict, entity_from_json, entity_from_dict
-from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch
+from .elasticsearch import release_to_elasticsearch, container_to_elasticsearch, changelog_to_elasticsearch, file_to_elasticsearch
from .csl import release_to_csl, citeproc_csl
from .ingest import release_ingest_request
diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 3a53db4d..8141a8b9 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -357,3 +357,48 @@ def changelog_to_elasticsearch(entity):
#t['deleted'] = deleted
#t['total'] = created + updated + deleted
return t
+
+
def file_to_elasticsearch(entity):
    """
    Converts a file entity model/schema to an elasticsearch-oriented dict.

    Args:
        entity: a file entity with .state, .ident, .revision, .release_ids,
            .mimetype, .size, .sha1, .sha256, .md5, and .urls (each URL
            object carrying .rel and .url)

    Returns: dict
    Raises exception on error (never returns None)
    """

    # Redirect/deleted entities carry no revision metadata; emit a stub doc
    if entity.state in ('redirect', 'deleted'):
        return dict(
            ident = entity.ident,
            state = entity.state,
        )
    elif entity.state != 'active':
        raise ValueError("Unhandled entity state: {}".format(entity.state))

    # First, the easy ones (direct copy).
    # Fix: the original also set 'rel' here, only to overwrite it below —
    # dead duplicate assignment removed.
    t = dict(
        ident = entity.ident,
        state = entity.state,
        revision = entity.revision,
        release_ids = entity.release_ids,
        release_count = len(entity.release_ids),
        mimetype = entity.mimetype,
        size_bytes = entity.size,
        sha1 = entity.sha1,
        sha256 = entity.sha256,
        md5 = entity.md5,
    )

    # TODO: domain, hosts (from urls; use proper urlcanon)
    # NOTE(review): the ES schema declares concrete fields 'rels'/'hosts'/
    # 'domains' (with 'rel'/'host'/'domain' as aliases); writing the alias
    # names here may conflict at index time — confirm against the schema.
    t['rel'] = list(set(u.rel for u in entity.urls))
    t['host'] = []
    t['domain'] = []

    # True when any URL points at the Internet Archive (archive.org or
    # the wayback machine)
    t['in_ia'] = any(
        '://archive.org/' in u.url or '://web.archive.org/' in u.url
        for u in entity.urls)

    return t
diff --git a/python/tests/transform_tests.py b/python/tests/transform_tests.py
index f254e117..7b583ac4 100644
--- a/python/tests/transform_tests.py
+++ b/python/tests/transform_tests.py
@@ -7,6 +7,7 @@ from fixtures import api
from import_journal_metadata import journal_metadata_importer
from import_crossref import crossref_importer
+from import_matched import matched_importer
def test_basic_elasticsearch_convert(crossref_importer):
with open('tests/files/crossref-works.single.json', 'r') as f:
@@ -72,14 +73,34 @@ def test_rich_elasticsearch_convert():
assert es['ref_count'] == 2
assert es['ref_linked_count'] == 1
-def test_elasticsearch_from_json():
+def test_elasticsearch_release_from_json():
r = entity_from_json(open('./tests/files/math_universe.json', 'r').read(), ReleaseEntity)
release_to_elasticsearch(r)
-def test_elasticsearch_container_convert(journal_metadata_importer):
+def test_elasticsearch_container_transform(journal_metadata_importer):
with open('tests/files/journal_metadata.sample.json', 'r') as f:
raw = json.loads(f.readline())
c = journal_metadata_importer.parse_record(raw)
c.state = 'active'
es = container_to_elasticsearch(c)
assert es['publisher'] == c.publisher
+
def test_elasticsearch_file_transform(matched_importer):
    """
    Round-trips a matched-import fixture record through the file entity
    parser and the elasticsearch transform, checking field mapping.
    """
    with open('tests/files/example_matched.json', 'r') as f:
        raw = json.loads(f.readline())
    # Fix: the original rebound 'f' (the file handle) to the parsed entity;
    # use a distinct name to avoid confusing shadowing.
    fe = matched_importer.parse_record(raw)

    fe.state = 'active'
    es = file_to_elasticsearch(fe)
    assert es['sha1'] == fe.sha1
    assert es['sha256'] == fe.sha256
    assert es['md5'] == fe.md5
    assert es['size_bytes'] == fe.size
    assert es['mimetype'] == fe.mimetype
    assert es['in_ia'] == True
    assert 'publisher' in es['rel']

    # XXX: implement hosts and domain parsing with urlcanon
    #assert 'journals.plos.org' in es['host']
    #assert 'plos.org' in es['domain']