From 5d458a3df7e58e6551d8ec72979e376c62fdd2f7 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 29 Jan 2020 21:52:33 -0800
Subject: fix some transform bugs, add some tests

---
 python/tests/transform_elasticsearch.py | 114 ++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 python/tests/transform_elasticsearch.py

(limited to 'python/tests/transform_elasticsearch.py')

diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
new file mode 100644
index 00000000..ab613a0a
--- /dev/null
+++ b/python/tests/transform_elasticsearch.py
@@ -0,0 +1,114 @@
+
+import json
+import pytest
+from fatcat_tools import *
+from fatcat_openapi_client import *
+from fixtures import api
+from import_journal_metadata import journal_metadata_importer
+
+from import_crossref import crossref_importer
+from import_matched import matched_importer
+
+def test_basic_elasticsearch_convert(crossref_importer):
+    with open('tests/files/crossref-works.single.json', 'r') as f:
+        # not a single line
+        raw = json.loads(f.read())
+        r = crossref_importer.parse_record(raw)
+    r.state = 'active'
+    release_to_elasticsearch(r)
+
+def test_rich_elasticsearch_convert():
+    r = ReleaseEntity(
+        title="something",
+        release_year=1234,
+        license_slug="CC-BY-NC",
+        ext_ids=ReleaseExtIds(),
+        refs=[
+            ReleaseRef(),
+            ReleaseRef(target_release_id="iznnn644szdwva7khyxqzc73bi"),
+        ],
+    )
+    r.state = 'active'
+    r.container = ContainerEntity(
+        name="dummy journal",
+        extra={
+            "ia": {
+                "sim": {
+                    "year_spans": [[1000, 1100]],
+                },
+            },
+            "kbart": {
+                "lockss": {
+                    "year_spans": [[1200, 1300]],
+                },
+                "jstor": {
+                    "year_spans": [[1950, 1960], [1980, 2005]],
+                },
+            },
+            "sherpa_romeo": {"color": "blue"},
+            "doaj": {"as_of": "2010-02-03"},
+        },
+    )
+    r.files = [FileEntity(
+        mimetype="application/pdf",
+        urls=[
+            FileUrl(rel="dweb", url="dat://a954329dlk/thingie"),
+            FileUrl(rel="webarchive", url="https://web.archive.org/web/20001122030405/http://example.com"),
+            FileUrl(rel="web", url="https://archive.org/details/blah/file.pdf"),
+        ],
+        extra={
+            "shadows": {},
+        },
+    )]
+    es = release_to_elasticsearch(r)
+    assert es['release_year'] == r.release_year
+    assert es['in_ia'] == True
+    assert es['in_jstor'] == False
+    assert es['in_ia_sim'] == False
+    assert es['in_ia'] == True
+    assert es['in_web'] == True
+    assert es['in_dweb'] == True
+    assert es['is_oa'] == True
+    assert es['is_longtail_oa'] == False
+    assert es['ref_count'] == 2
+    assert es['ref_linked_count'] == 1
+
+def test_elasticsearch_release_from_json():
+    r = entity_from_json(open('./tests/files/release_etodop5banbndg3faecnfm6ozi.json', 'r').read(), ReleaseEntity)
+    es = release_to_elasticsearch(r)
+
+    assert es['subtitle'] == "Correpondence"
+    assert es['ident'] == "etodop5banbndg3faecnfm6ozi"
+    assert es['container_name'] == "BJOG: an International Journal of Obstetrics and Gynaecology"
+    assert es['first_page'] == "1404"
+    assert es['issue'] == "11"
+    assert es['volume'] == "118"
+    assert es['number'] == None
+    assert es['in_ia_sim'] == True
+    assert es['in_kbart'] == True
+
+def test_elasticsearch_container_transform(journal_metadata_importer):
+    with open('tests/files/journal_metadata.sample.json', 'r') as f:
+        raw = json.loads(f.readline())
+    c = journal_metadata_importer.parse_record(raw)
+    c.state = 'active'
+    es = container_to_elasticsearch(c)
+    assert es['publisher'] == c.publisher
+
+def test_elasticsearch_file_transform(matched_importer):
+    f = entity_from_json(open('./tests/files/file_bcah4zp5tvdhjl5bqci2c2lgfa.json', 'r').read(), FileEntity)
+
+    f.state = 'active'
+    es = file_to_elasticsearch(f)
+    assert es['sha1'] == f.sha1
+    assert es['sha256'] == f.sha256
+    assert es['md5'] == f.md5
+    assert es['size_bytes'] == f.size
+    assert es['mimetype'] == f.mimetype
+    assert es['in_ia'] == True
+    assert 'publisher' in es['rel']
+
+    # XXX: implement hosts and domain parsing with urlcanon
+    #assert 'journals.plos.org' in es['host']
+    #assert 'plos.org' in es['domain']
+
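
The in_ia_sim and in_jstor assertions in test_rich_elasticsearch_convert hinge on whether the release year falls inside any of the container's coverage year_spans. A minimal sketch of that containment check, as a standalone illustration rather than the actual fatcat implementation (the function name year_in_spans is hypothetical):

    def year_in_spans(year, spans):
        # spans is a list of [first_year, last_year] pairs, inclusive on both ends
        return any(first <= year <= last for (first, last) in spans)

    # For the dummy container above, release_year=1234 is outside the "ia.sim"
    # span [1000, 1100] and outside both "jstor" spans, which is why the test
    # expects in_ia_sim == False and in_jstor == False.
    assert year_in_spans(1234, [[1000, 1100]]) == False
    assert year_in_spans(1234, [[1950, 1960], [1980, 2005]]) == False
    assert year_in_spans(1234, [[1200, 1300]]) == True  # the "kbart.lockss" span
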
From d58c3891ac2122dac53ced606568108f543f2d80 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 29 Jan 2020 21:52:58 -0800
Subject: actually implement changelog transform

---
 extra/elasticsearch/changelog_schema.json       | 11 ++++-
 python/fatcat_tools/transforms/elasticsearch.py | 62 ++++++++++++++++++-------
 python/tests/transform_elasticsearch.py         | 24 +++++++++-
 3 files changed, 78 insertions(+), 19 deletions(-)

diff --git a/extra/elasticsearch/changelog_schema.json b/extra/elasticsearch/changelog_schema.json
index f3211e99..77c77238 100644
--- a/extra/elasticsearch/changelog_schema.json
+++ b/extra/elasticsearch/changelog_schema.json
@@ -16,20 +16,29 @@
     "changelog": {
         "properties": {
             "index": { "type": "integer" },
-            "editgroup_id": { "type": "keyword" },
+            "editgroup_id": { "type": "keyword", "doc_values": false },
             "timestamp": { "type": "date" },
             "editor_id": { "type": "keyword" },
             "username": { "type": "keyword" },
             "is_bot": { "type": "boolean" },
             "is_admin": { "type": "boolean" },
             "agent": { "type": "keyword" },
+            "containers": { "type": "integer" },
+            "new_containers": { "type": "integer" },
             "creators": { "type": "integer" },
+            "new_creators": { "type": "integer" },
             "files": { "type": "integer" },
+            "new_files": { "type": "integer" },
             "filessets": { "type": "integer" },
+            "new_filessets": { "type": "integer" },
             "webcaptures": { "type": "integer" },
+            "new_webcaptures": { "type": "integer" },
             "releases": { "type": "integer" },
+            "new_releases": { "type": "integer" },
             "works": { "type": "integer" },
+            "new_works": { "type": "integer" },
+
             "created": { "type": "integer" },
             "updated": { "type": "integer" },
             "deleted": { "type": "integer" },

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 812cd1fd..c8547b27 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -401,36 +401,64 @@ def container_to_elasticsearch(entity, force_bool=True):
 
     return t
 
+
+def _type_of_edit(edit):
+    if edit.revision == None and edit.redirect_ident == None:
+        return 'delete'
+    elif edit.redirect_ident:
+        # redirect
+        return 'update'
+    elif edit.prev_revision == None and edit.redirect_ident == None and edit.revision:
+        return 'create'
+    else:
+        return 'update'
+
+
 def changelog_to_elasticsearch(entity):
 
     editgroup = entity.editgroup
     t = dict(
         index=entity.index,
         editgroup_id=entity.editgroup_id,
-        timestamp=entity.timestamp,
+        timestamp=entity.timestamp.isoformat(),
         editor_id=editgroup.editor_id,
+        username=editgroup.editor.username,
+        is_bot=editgroup.editor.is_bot,
+        is_admin=editgroup.editor.is_admin,
     )
 
     extra = editgroup.extra or dict()
     if extra.get('agent'):
         t['agent'] = extra['agent']
 
-    t['containers'] = len(editgroup.edits.containers)
-    t['creators'] = len(editgroup.edits.containers)
-    t['files'] = len(editgroup.edits.containers)
-    t['filesets'] = len(editgroup.edits.containers)
-    t['webcaptures'] = len(editgroup.edits.containers)
-    t['releases'] = len(editgroup.edits.containers)
-    t['works'] = len(editgroup.edits.containers)
-
-    # TODO: parse and pull out counts
-    #created = 0
-    #updated = 0
-    #deleted = 0
-    #t['created'] = created
-    #t['updated'] = updated
-    #t['deleted'] = deleted
-    #t['total'] = created + updated + deleted
+    containers = [_type_of_edit(e) for e in editgroup.edits.containers]
+    creators = [_type_of_edit(e) for e in editgroup.edits.creators]
+    files = [_type_of_edit(e) for e in editgroup.edits.files]
+    filesets = [_type_of_edit(e) for e in editgroup.edits.filesets]
+    webcaptures = [_type_of_edit(e) for e in editgroup.edits.webcaptures]
+    releases = [_type_of_edit(e) for e in editgroup.edits.releases]
+    works = [_type_of_edit(e) for e in editgroup.edits.works]
+
+    t['containers'] = len(containers)
+    t['new_containers'] = len([e for e in containers if e == 'create'])
+    t['creators'] = len(creators)
+    t['new_creators'] = len([e for e in creators if e == 'create'])
+    t['files'] = len(files)
+    t['new_files'] = len([e for e in files if e == 'create'])
+    t['filesets'] = len(filesets)
+    t['new_filesets'] = len([e for e in filesets if e == 'create'])
+    t['webcaptures'] = len(webcaptures)
+    t['new_webcaptures'] = len([e for e in webcaptures if e == 'create'])
+    t['releases'] = len(releases)
+    t['new_releases'] = len([e for e in releases if e == 'create'])
+    t['works'] = len(works)
+    t['new_works'] = len([e for e in works if e == 'create'])
+
+    all_edits = containers + creators + files + filesets + webcaptures + releases + works
+
+    t['created'] = len([e for e in all_edits if e == 'create'])
+    t['updated'] = len([e for e in all_edits if e == 'update'])
+    t['deleted'] = len([e for e in all_edits if e == 'delete'])
+    t['total'] = len(all_edits)
     return t

diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index ab613a0a..89a4eef8 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -106,9 +106,31 @@ def test_elasticsearch_file_transform(matched_importer):
     assert es['size_bytes'] == f.size
     assert es['mimetype'] == f.mimetype
     assert es['in_ia'] == True
-    assert 'publisher' in es['rel']
+    assert 'web' in es['rel']
 
     # XXX: implement hosts and domain parsing with urlcanon
     #assert 'journals.plos.org' in es['host']
     #assert 'plos.org' in es['domain']
 
+def test_elasticsearch_changelog_transform(matched_importer):
+    ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)
+
+    es = changelog_to_elasticsearch(ce)
+    assert es['index'] == 3469683
+    # len("2020-01-30T05:04:39") => 19
+    assert es['timestamp'][:19] == "2020-01-30T05:04:39.738601Z"[:19]
+    assert es['editor_id'] == "scmbogxw25evtcesfcab5qaboa"
+    assert es['username'] == "crawl-bot"
+    assert es['is_bot'] == True
+    assert es['is_admin'] == True
+    assert es['agent'] == "fatcat_tools.IngestFileResultImporter"
+
+    assert es['total'] == 50
+    assert es['files'] == 50
+    assert es['new_files'] == 50
+    assert es['created'] == 50
+
+    assert es['releases'] == 0
+    assert es['new_releases'] == 0
+    assert es['updated'] == 0
+    assert es['deleted'] == 0
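
Two details in this patch are worth unpacking. In the schema, "doc_values": false on editgroup_id disables Elasticsearch's on-disk columnar doc-values for that field, which saves space for an identifier that is filtered on but never sorted or aggregated. In the transform, _type_of_edit() classifies each edit by which of revision, prev_revision, and redirect_ident are set. Here is the function from the patch exercised against a minimal stand-in object (EditStub is a hypothetical fixture; the real inputs are EntityEdit objects from fatcat_openapi_client):

    from collections import namedtuple

    # hypothetical stand-in for fatcat_openapi_client's EntityEdit
    EditStub = namedtuple('EditStub', ['revision', 'prev_revision', 'redirect_ident'])

    def _type_of_edit(edit):
        if edit.revision == None and edit.redirect_ident == None:
            return 'delete'
        elif edit.redirect_ident:
            # redirect
            return 'update'
        elif edit.prev_revision == None and edit.redirect_ident == None and edit.revision:
            return 'create'
        else:
            return 'update'

    assert _type_of_edit(EditStub('rev-b', None, None)) == 'create'       # first revision
    assert _type_of_edit(EditStub('rev-b', 'rev-a', None)) == 'update'    # revised entity
    assert _type_of_edit(EditStub(None, None, 'some-ident')) == 'update'  # redirect
    assert _type_of_edit(EditStub(None, 'rev-a', None)) == 'delete'       # tombstone
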
From ade1eb9ff955ca5ba58acdc8b76e344c9cc54790 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Wed, 29 Jan 2020 23:56:27 -0800
Subject: fix ES file schema plural field names

---
 python/fatcat_tools/transforms/elasticsearch.py | 7 +++----
 python/tests/transform_elasticsearch.py         | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 42669bbf..5a492fb4 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -497,13 +497,12 @@ def file_to_elasticsearch(entity):
         sha1 = entity.sha1,
         sha256 = entity.sha256,
         md5 = entity.md5,
-        rel = [u.rel for u in entity.urls],
     )
 
     # TODO: domain, hosts (from urls; use proper urlcanon)
-    t['rel'] = list(set([u.rel for u in entity.urls]))
-    t['host'] = []
-    t['domain'] = []
+    t['rels'] = list(set([u.rel for u in entity.urls]))
+    t['hosts'] = []
+    t['domains'] = []
 
     in_ia = False
     for u in entity.urls:

diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index 89a4eef8..c247e745 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -106,7 +106,7 @@ def test_elasticsearch_file_transform(matched_importer):
     assert es['size_bytes'] == f.size
     assert es['mimetype'] == f.mimetype
     assert es['in_ia'] == True
-    assert 'web' in es['rel']
+    assert 'web' in es['rels']
 
     # XXX: implement hosts and domain parsing with urlcanon
     #assert 'journals.plos.org' in es['host']
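
Besides the rename, this patch drops the redundant rel= kwarg: the field was being computed twice, once as a plain list in the dict() constructor and once deduplicated below. The surviving list(set(...)) form collapses duplicates and discards ordering; a small standalone illustration (plain Python; the sample data is invented for the example):

    # urls mimics entity.urls as (rel, url) pairs
    urls = [('web', 'http://a.example/x.pdf'),
            ('web', 'http://b.example/x.pdf'),
            ('webarchive', 'https://web.archive.org/web/1/http://a.example/x.pdf')]

    rels = list(set(rel for rel, _ in urls))
    assert sorted(rels) == ['web', 'webarchive']  # duplicates collapsed; order not guaranteed
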
From 4cbee44529dd967c966ed3f2cc2bb80176be4e43 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Thu, 30 Jan 2020 00:08:41 -0800
Subject: implement host+domain parsing for file ES transform

---
 python/fatcat_tools/transforms/elasticsearch.py | 14 +++++---------
 python/tests/transform_elasticsearch.py         |  7 +++----
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index 5a492fb4..e1980d90 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -1,6 +1,6 @@
-
 import collections
+import tldextract
 
 from fatcat_openapi_client import ApiClient
@@ -499,15 +499,11 @@ def file_to_elasticsearch(entity):
         md5 = entity.md5,
     )
 
-    # TODO: domain, hosts (from urls; use proper urlcanon)
+    parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
+    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+    t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
     t['rels'] = list(set([u.rel for u in entity.urls]))
-    t['hosts'] = []
-    t['domains'] = []
 
-    in_ia = False
-    for u in entity.urls:
-        if '://archive.org/' in u.url or '://web.archive.org/' in u.url:
-            in_ia = True
-    t['in_ia'] = bool(in_ia)
+    t['in_ia'] = bool('archive.org' in t['domains'])
 
     return t

diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index c247e745..e67681c6 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -106,11 +106,10 @@ def test_elasticsearch_file_transform(matched_importer):
     assert es['size_bytes'] == f.size
     assert es['mimetype'] == f.mimetype
     assert es['in_ia'] == True
-    assert 'web' in es['rels']
 
-    # XXX: implement hosts and domain parsing with urlcanon
-    #assert 'journals.plos.org' in es['host']
-    #assert 'plos.org' in es['domain']
+    assert 'web' in es['rels']
+    assert 'www.zhros.ru' in es['hosts']
+    assert 'zhros.ru' in es['domains']
 
 def test_elasticsearch_changelog_transform(matched_importer):
     ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)
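
A caveat on the '.'.join(pu) introduced here: with the tldextract releases current at the time (where extract() returns a (subdomain, domain, suffix) namedtuple), the subdomain segment is an empty string for bare domains, so the plain join produces a leading dot. A sketch of the behavior, assuming tldextract is installed:

    import tldextract

    pu = tldextract.extract('https://archive.org/details/blah/file.pdf')
    # pu == ExtractResult(subdomain='', domain='archive', suffix='org')
    assert '.'.join(pu) == '.archive.org'                       # leading-dot artifact
    assert '.'.join(seg for seg in pu if seg) == 'archive.org'  # skip empty segments

The empty-segment filter is exactly the fix that lands in the next commit, and the '.archive.org' string is the "old regression" its updated test asserts against.
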
From 741c7b1efae5e39f3ee2c082e3ca28c6c5c85b76 Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 31 Jan 2020 13:31:59 -0800
Subject: ES releases: host/domain fixes

---
 python/fatcat_tools/transforms/elasticsearch.py | 4 ++--
 python/tests/transform_elasticsearch.py         | 3 +++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/python/fatcat_tools/transforms/elasticsearch.py b/python/fatcat_tools/transforms/elasticsearch.py
index b5abe2ae..f8bc05fb 100644
--- a/python/fatcat_tools/transforms/elasticsearch.py
+++ b/python/fatcat_tools/transforms/elasticsearch.py
@@ -502,7 +502,7 @@ def file_to_elasticsearch(entity):
     )
 
     parsed_urls = [tldextract.extract(u.url) for u in entity.urls]
-    t['hosts'] = list(set(['.'.join(pu) for pu in parsed_urls]))
+    t['hosts'] = list(set(['.'.join([seg for seg in pu if seg]) for pu in parsed_urls]))
     t['domains'] = list(set([pu.registered_domain for pu in parsed_urls]))
     t['rels'] = list(set([u.rel for u in entity.urls]))
 
@@ -512,6 +512,6 @@ def file_to_elasticsearch(entity):
     # ok, but actually remove archive.org hosts, because they make other
     # aggregations hard and are a waste of storage
     t['hosts'] = [h for h in t['hosts'] if h not in ('archive.org', 'web.archive.org')]
-    t['domains'] = [h for h in t['hosts'] if h not in ('archive.org')]
+    t['domains'] = [h for h in t['domains'] if h not in ('archive.org')]
 
     return t

diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index e67681c6..c94ab375 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -110,6 +110,9 @@ def test_elasticsearch_file_transform(matched_importer):
     assert 'web' in es['rels']
     assert 'www.zhros.ru' in es['hosts']
     assert 'zhros.ru' in es['domains']
+    assert not '.archive.org' in (es['hosts'] + es['domains'])
+    assert not 'archive.org' in (es['hosts'] + es['domains'])
+    assert not 'web.archive.org' in (es['hosts'] + es['domains'])
 
 def test_elasticsearch_changelog_transform(matched_importer):
     ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)

From ed38bfde4e1eaddd6d710802b6f372c7b0aab26b Mon Sep 17 00:00:00 2001
From: Bryan Newbold
Date: Fri, 14 Feb 2020 00:07:56 -0800
Subject: ES updates: fix tests to accept archive.org in host/domain

---
 python/tests/transform_elasticsearch.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/python/tests/transform_elasticsearch.py b/python/tests/transform_elasticsearch.py
index c94ab375..a954fc4d 100644
--- a/python/tests/transform_elasticsearch.py
+++ b/python/tests/transform_elasticsearch.py
@@ -110,9 +110,10 @@ def test_elasticsearch_file_transform(matched_importer):
     assert 'web' in es['rels']
     assert 'www.zhros.ru' in es['hosts']
     assert 'zhros.ru' in es['domains']
+    assert 'archive.org' in (es['hosts'] + es['domains'])
+    assert 'web.archive.org' in (es['hosts'] + es['domains'])
+    # old regression
     assert not '.archive.org' in (es['hosts'] + es['domains'])
-    assert not 'archive.org' in (es['hosts'] + es['domains'])
-    assert not 'web.archive.org' in (es['hosts'] + es['domains'])
 
 def test_elasticsearch_changelog_transform(matched_importer):
     ce = entity_from_json(open('./tests/files/changelog_3469683.json', 'r').read(), ChangelogEntry)
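
Taken together, the series leaves the file transform deriving rels, hosts, and domains from the URL list via tldextract. Note that the last commit above only updates the tests: the code change that lets archive.org back into hosts/domains is not visible in this path-limited log. A condensed, self-contained sketch of the URL summarization under those assumptions (Url is a hypothetical stand-in for fatcat's FileUrl; tldextract's namedtuple-era API is assumed):

    from collections import namedtuple
    import tldextract

    Url = namedtuple('Url', ['rel', 'url'])  # stand-in for fatcat's FileUrl

    def summarize_urls(urls):
        # parse each URL into (subdomain, domain, suffix) segments
        parsed = [tldextract.extract(u.url) for u in urls]
        return {
            'rels': list(set(u.rel for u in urls)),
            'hosts': list(set('.'.join(seg for seg in pu if seg) for pu in parsed)),
            'domains': list(set(pu.registered_domain for pu in parsed)),
        }

    es = summarize_urls([
        Url('web', 'http://www.zhros.ru/article.pdf'),
        Url('webarchive', 'https://web.archive.org/web/2020/http://www.zhros.ru/article.pdf'),
    ])
    assert 'www.zhros.ru' in es['hosts']
    assert 'zhros.ru' in es['domains']
    assert 'web.archive.org' in es['hosts']
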