From b03bfc8f3fd84141738f775b273a99850d78e1ff Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 12 Nov 2018 23:18:56 -0800 Subject: refactor python modules --- python/config.py | 20 -- python/fatcat/__init__.py | 18 - python/fatcat/changelog_workers.py | 122 ------- python/fatcat/crossref_importer.py | 272 --------------- python/fatcat/elastic_workers.py | 47 --- python/fatcat/entity_helpers.py | 100 ------ python/fatcat/fcid.py | 17 - python/fatcat/grobid_metadata_importer.py | 168 ---------- python/fatcat/importer_common.py | 137 -------- python/fatcat/issn_importer.py | 72 ---- python/fatcat/matched_importer.py | 144 -------- python/fatcat/orcid_importer.py | 73 ----- python/fatcat/raw_api_client.py | 66 ---- python/fatcat/routes.py | 364 --------------------- python/fatcat/search.py | 60 ---- python/fatcat/static/fatcat.jpg | Bin 86240 -> 0 bytes python/fatcat/static/robots.txt | 1 - python/fatcat/templates/404.html | 6 - python/fatcat/templates/about.html | 190 ----------- python/fatcat/templates/base.html | 78 ----- python/fatcat/templates/changelog.html | 25 -- python/fatcat/templates/changelog_view.html | 13 - python/fatcat/templates/container_create.html | 168 ---------- python/fatcat/templates/container_view.html | 108 ------ python/fatcat/templates/creator_view.html | 82 ----- python/fatcat/templates/editgroup_view.html | 54 --- python/fatcat/templates/editor_changelog.html | 29 -- python/fatcat/templates/editor_view.html | 12 - python/fatcat/templates/entity_edit.html | 8 - python/fatcat/templates/entity_history.html | 30 -- python/fatcat/templates/file_view.html | 108 ------ python/fatcat/templates/home.html | 91 ------ python/fatcat/templates/release_changelog.html | 17 - python/fatcat/templates/release_create.html | 215 ------------ python/fatcat/templates/release_search.html | 64 ---- python/fatcat/templates/release_view.html | 290 ---------------- python/fatcat/templates/stats.html | 104 ------ python/fatcat/templates/work_view.html | 72 ---- 
python/fatcat/worker_common.py | 25 -- python/fatcat_tools/changelog_workers.py | 122 +++++++ python/fatcat_tools/crossref_importer.py | 272 +++++++++++++++ python/fatcat_tools/elastic_workers.py | 47 +++ python/fatcat_tools/entity_helpers.py | 100 ++++++ python/fatcat_tools/fcid.py | 17 + python/fatcat_tools/grobid_metadata_importer.py | 168 ++++++++++ python/fatcat_tools/importer_common.py | 137 ++++++++ python/fatcat_tools/issn_importer.py | 72 ++++ python/fatcat_tools/matched_importer.py | 144 ++++++++ python/fatcat_tools/orcid_importer.py | 73 +++++ python/fatcat_tools/raw_api_client.py | 66 ++++ python/fatcat_tools/worker_common.py | 25 ++ python/fatcat_web/__init__.py | 18 + python/fatcat_web/routes.py | 364 +++++++++++++++++++++ python/fatcat_web/search.py | 60 ++++ python/fatcat_web/static/fatcat.jpg | Bin 0 -> 86240 bytes python/fatcat_web/static/robots.txt | 1 + python/fatcat_web/templates/404.html | 6 + python/fatcat_web/templates/about.html | 190 +++++++++++ python/fatcat_web/templates/base.html | 78 +++++ python/fatcat_web/templates/changelog.html | 25 ++ python/fatcat_web/templates/changelog_view.html | 13 + python/fatcat_web/templates/container_create.html | 168 ++++++++++ python/fatcat_web/templates/container_view.html | 108 ++++++ python/fatcat_web/templates/creator_view.html | 82 +++++ python/fatcat_web/templates/editgroup_view.html | 54 +++ python/fatcat_web/templates/editor_changelog.html | 29 ++ python/fatcat_web/templates/editor_view.html | 12 + python/fatcat_web/templates/entity_edit.html | 8 + python/fatcat_web/templates/entity_history.html | 30 ++ python/fatcat_web/templates/file_view.html | 108 ++++++ python/fatcat_web/templates/home.html | 91 ++++++ python/fatcat_web/templates/release_changelog.html | 17 + python/fatcat_web/templates/release_create.html | 215 ++++++++++++ python/fatcat_web/templates/release_search.html | 64 ++++ python/fatcat_web/templates/release_view.html | 290 ++++++++++++++++ python/fatcat_web/templates/stats.html | 
104 ++++++ python/fatcat_web/templates/work_view.html | 72 ++++ python/webface_config.py | 20 ++ 78 files changed, 3470 insertions(+), 3470 deletions(-) delete mode 100644 python/config.py delete mode 100644 python/fatcat/__init__.py delete mode 100644 python/fatcat/changelog_workers.py delete mode 100644 python/fatcat/crossref_importer.py delete mode 100644 python/fatcat/elastic_workers.py delete mode 100644 python/fatcat/entity_helpers.py delete mode 100644 python/fatcat/fcid.py delete mode 100755 python/fatcat/grobid_metadata_importer.py delete mode 100644 python/fatcat/importer_common.py delete mode 100644 python/fatcat/issn_importer.py delete mode 100644 python/fatcat/matched_importer.py delete mode 100644 python/fatcat/orcid_importer.py delete mode 100644 python/fatcat/raw_api_client.py delete mode 100644 python/fatcat/routes.py delete mode 100644 python/fatcat/search.py delete mode 100644 python/fatcat/static/fatcat.jpg delete mode 100644 python/fatcat/static/robots.txt delete mode 100644 python/fatcat/templates/404.html delete mode 100644 python/fatcat/templates/about.html delete mode 100644 python/fatcat/templates/base.html delete mode 100644 python/fatcat/templates/changelog.html delete mode 100644 python/fatcat/templates/changelog_view.html delete mode 100644 python/fatcat/templates/container_create.html delete mode 100644 python/fatcat/templates/container_view.html delete mode 100644 python/fatcat/templates/creator_view.html delete mode 100644 python/fatcat/templates/editgroup_view.html delete mode 100644 python/fatcat/templates/editor_changelog.html delete mode 100644 python/fatcat/templates/editor_view.html delete mode 100644 python/fatcat/templates/entity_edit.html delete mode 100644 python/fatcat/templates/entity_history.html delete mode 100644 python/fatcat/templates/file_view.html delete mode 100644 python/fatcat/templates/home.html delete mode 100644 python/fatcat/templates/release_changelog.html delete mode 100644 
python/fatcat/templates/release_create.html delete mode 100644 python/fatcat/templates/release_search.html delete mode 100644 python/fatcat/templates/release_view.html delete mode 100644 python/fatcat/templates/stats.html delete mode 100644 python/fatcat/templates/work_view.html delete mode 100644 python/fatcat/worker_common.py create mode 100644 python/fatcat_tools/changelog_workers.py create mode 100644 python/fatcat_tools/crossref_importer.py create mode 100644 python/fatcat_tools/elastic_workers.py create mode 100644 python/fatcat_tools/entity_helpers.py create mode 100644 python/fatcat_tools/fcid.py create mode 100755 python/fatcat_tools/grobid_metadata_importer.py create mode 100644 python/fatcat_tools/importer_common.py create mode 100644 python/fatcat_tools/issn_importer.py create mode 100644 python/fatcat_tools/matched_importer.py create mode 100644 python/fatcat_tools/orcid_importer.py create mode 100644 python/fatcat_tools/raw_api_client.py create mode 100644 python/fatcat_tools/worker_common.py create mode 100644 python/fatcat_web/__init__.py create mode 100644 python/fatcat_web/routes.py create mode 100644 python/fatcat_web/search.py create mode 100644 python/fatcat_web/static/fatcat.jpg create mode 100644 python/fatcat_web/static/robots.txt create mode 100644 python/fatcat_web/templates/404.html create mode 100644 python/fatcat_web/templates/about.html create mode 100644 python/fatcat_web/templates/base.html create mode 100644 python/fatcat_web/templates/changelog.html create mode 100644 python/fatcat_web/templates/changelog_view.html create mode 100644 python/fatcat_web/templates/container_create.html create mode 100644 python/fatcat_web/templates/container_view.html create mode 100644 python/fatcat_web/templates/creator_view.html create mode 100644 python/fatcat_web/templates/editgroup_view.html create mode 100644 python/fatcat_web/templates/editor_changelog.html create mode 100644 python/fatcat_web/templates/editor_view.html create mode 100644 
python/fatcat_web/templates/entity_edit.html create mode 100644 python/fatcat_web/templates/entity_history.html create mode 100644 python/fatcat_web/templates/file_view.html create mode 100644 python/fatcat_web/templates/home.html create mode 100644 python/fatcat_web/templates/release_changelog.html create mode 100644 python/fatcat_web/templates/release_create.html create mode 100644 python/fatcat_web/templates/release_search.html create mode 100644 python/fatcat_web/templates/release_view.html create mode 100644 python/fatcat_web/templates/stats.html create mode 100644 python/fatcat_web/templates/work_view.html create mode 100644 python/webface_config.py diff --git a/python/config.py b/python/config.py deleted file mode 100644 index 3d6db049..00000000 --- a/python/config.py +++ /dev/null @@ -1,20 +0,0 @@ - -import os -import subprocess - -basedir = os.path.abspath(os.path.dirname(__file__)) - -class Config(object): - SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URI') or \ - 'sqlite:///' + os.path.join(basedir, 'fatcat_dev.sqlite') - SQLALCHEMY_TRACK_MODIFICATIONS = False - GIT_REVISION = subprocess.check_output(["git", "describe", "--always"]).strip() - # This is, effectively, the QA/PROD flag - FATCAT_DOMAIN = "qa.fatcat.wiki" - ELASTIC_BACKEND = "https://search.fatcat.wiki" - ELASTIC_INDEX = "fatcat" - - # "Event more verbose" debug options. SECRET_KEY is bogus. 
- #SQLALCHEMY_ECHO = True - #SECRET_KEY = "kuhy0284hflskjhg01284" - #DEBUG = True diff --git a/python/fatcat/__init__.py b/python/fatcat/__init__.py deleted file mode 100644 index aa12f972..00000000 --- a/python/fatcat/__init__.py +++ /dev/null @@ -1,18 +0,0 @@ - -from flask import Flask -from flask_uuid import FlaskUUID -from flask_debugtoolbar import DebugToolbarExtension -from config import Config -import fatcat_client - -toolbar = DebugToolbarExtension() -app = Flask(__name__) -app.config.from_object(Config) -toolbar = DebugToolbarExtension(app) -FlaskUUID(app) - -conf = fatcat_client.Configuration() -conf.host = "http://localhost:9411/v0" -api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) - -from fatcat import routes diff --git a/python/fatcat/changelog_workers.py b/python/fatcat/changelog_workers.py deleted file mode 100644 index e341ea32..00000000 --- a/python/fatcat/changelog_workers.py +++ /dev/null @@ -1,122 +0,0 @@ - -import json -import time -from itertools import islice -from fatcat.worker_common import FatcatWorker -from pykafka.common import OffsetType - - -class FatcatChangelogWorker(FatcatWorker): - """ - Periodically polls the fatcat API looking for new changelogs. When they are - found, fetch them and push (as JSON) into a Kafka topic. - """ - - def __init__(self, api_host_url, kafka_hosts, produce_topic, poll_interval=10.0, offset=None): - # TODO: should be offset=0 - super().__init__(kafka_hosts=kafka_hosts, - produce_topic=produce_topic, - api_host_url=api_host_url) - self.poll_interval = poll_interval - self.offset = offset # the fatcat changelog offset, not the kafka offset - - def most_recent_message(self, topic): - """ - Tries to fetch the most recent message from a given topic. - This only makes sense for single partition topics, though could be - extended with "last N" behavior. 
- - Following "Consuming the last N messages from a topic" - from https://pykafka.readthedocs.io/en/latest/usage.html#consumer-patterns - """ - consumer = topic.get_simple_consumer( - auto_offset_reset=OffsetType.LATEST, - reset_offset_on_start=True) - offsets = [(p, op.last_offset_consumed - 1) - for p, op in consumer._partitions.items()] - offsets = [(p, (o if o > -1 else -2)) for p, o in offsets] - if -2 in [o for p, o in offsets]: - return None - else: - consumer.reset_offsets(offsets) - msg = islice(consumer, 1) - if msg: - return list(msg)[0].value - else: - return None - - def run(self): - topic = self.kafka.topics[self.produce_topic] - # On start, try to consume the most recent from the topic, and using - # that as the starting offset. Note that this is a single-partition - # topic - if self.offset is None: - print("Checking for most recent changelog offset...") - msg = self.most_recent_message(topic) - if msg: - self.offset = json.loads(msg.decode('utf-8'))['index'] - else: - self.offset = 1 - - with topic.get_sync_producer() as producer: - while True: - latest = int(self.api.get_changelog(limit=1)[0].index) - if latest > self.offset: - print("Fetching changelogs from {} through {}".format( - self.offset+1, latest)) - for i in range(self.offset+1, latest+1): - cle = self.api.get_changelog_entry(i) - obj = self.api.api_client.sanitize_for_serialization(cle) - producer.produce( - message=json.dumps(obj).encode('utf-8'), - partition_key=None, - timestamp=None, - #XXX: timestamp=cle.timestamp, - ) - self.offset = i - print("Sleeping {} seconds...".format(self.poll_interval)) - time.sleep(self.poll_interval) - - -class FatcatEntityUpdatesWorker(FatcatWorker): - """ - Consumes from the changelog topic and publishes expanded entities (fetched - from API) to update topics. - - For now, only release updates are published. 
- """ - - def __init__(self, api_host_url, kafka_hosts, consume_topic, release_topic): - super().__init__(kafka_hosts=kafka_hosts, - consume_topic=consume_topic, - api_host_url=api_host_url) - self.release_topic = release_topic - self.consumer_group = "entity-updates" - - def run(self): - changelog_topic = self.kafka.topics[self.consume_topic] - release_topic = self.kafka.topics[self.release_topic] - - consumer = changelog_topic.get_balanced_consumer( - consumer_group=self.consumer_group, - managed=True, - auto_offset_reset=OffsetType.LATEST, - reset_offset_on_start=False, - ) - - with release_topic.get_sync_producer() as producer: - for msg in consumer: - cle = json.loads(msg.value.decode('utf-8')) - #print(cle) - release_edits = cle['editgroup']['edits']['releases'] - for re in release_edits: - ident = re['ident'] - release = self.api.get_release(ident, expand="files,container") - release_dict = self.api.api_client.sanitize_for_serialization(release) - producer.produce( - message=json.dumps(release_dict).encode('utf-8'), - partition_key=ident.encode('utf-8'), - timestamp=None, - ) - consumer.commit_offsets() - diff --git a/python/fatcat/crossref_importer.py b/python/fatcat/crossref_importer.py deleted file mode 100644 index 37005965..00000000 --- a/python/fatcat/crossref_importer.py +++ /dev/null @@ -1,272 +0,0 @@ - -import sys -import json -import sqlite3 -import datetime -import itertools -import fatcat_client -from fatcat.importer_common import FatcatImporter - - -class FatcatCrossrefImporter(FatcatImporter): - - def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True): - super().__init__(host_url, issn_map_file) - self.extid_map_db = None - if extid_map_file: - db_uri = "file:{}?mode=ro".format(extid_map_file) - print("Using external ID map: {}".format(db_uri)) - self.extid_map_db = sqlite3.connect(db_uri, uri=True) - else: - print("Not using external ID map") - self.create_containers = create_containers - - def 
lookup_ext_ids(self, doi): - if self.extid_map_db is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) - row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", - [doi.lower()]).fetchone() - if row is None: - return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) - row = [str(cell or '') or None for cell in row] - return dict( - core_id=row[0], - pmid=row[1], - pmcid=row[2], - wikidata_qid=row[3]) - - def parse_crossref_dict(self, obj): - """ - obj is a python dict (parsed from json). - returns a ReleaseEntity - """ - - # This work is out of scope if it doesn't have authors and a title - if (not 'author' in obj) or (not 'title' in obj): - return None - - # Other ways to be out of scope (provisionally) - if (not 'type' in obj): - return None - - # contribs - def do_contribs(obj_list, ctype): - contribs = [] - for i, am in enumerate(obj_list): - creator_id = None - if 'ORCID' in am.keys(): - creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) - # Sorry humans :( - if am.get('given') and am.get('family'): - raw_name = "{} {}".format(am['given'], am['family']) - elif am.get('family'): - raw_name = am['family'] - else: - # TODO: defaults back to a pseudo-null value - raw_name = am.get('given', '') - extra = dict() - if ctype == "author": - index = i - else: - index = None - if am.get('affiliation'): - # note: affiliation => affiliations - extra['affiliations'] = am.get('affiliation') - if am.get('sequence') and am.get('sequence') != "additional": - extra['sequence'] = am.get('sequence') - if not extra: - extra = None - contribs.append(fatcat_client.ReleaseContrib( - creator_id=creator_id, - index=index, - raw_name=raw_name, - role=ctype, - extra=extra)) - return contribs - contribs = do_contribs(obj['author'], "author") - contribs.extend(do_contribs(obj.get('editor', []), "editor")) - contribs.extend(do_contribs(obj.get('translator', []), "translator")) - - # container - issn = 
obj.get('ISSN', [None])[0] - issnl = self.issn2issnl(issn) - container_id = None - if issnl: - container_id = self.lookup_issnl(issnl) - publisher = obj.get('publisher') - - ce = None - if (container_id is None and self.create_containers and issnl != None - and obj.get('container-title') and len(obj['container-title']) > 0): - ce = fatcat_client.ContainerEntity( - issnl=issnl, - publisher=publisher, - name=obj['container-title'][0]) - - # references - refs = [] - for i, rm in enumerate(obj.get('reference', [])): - try: - year = int(rm.get('year')) - # NOTE: will need to update/config in the future! - # NOTE: are there crossref works with year < 100? - if year > 2025 or year < 100: - year = None - except: - year = None - extra = rm.copy() - if rm.get('DOI'): - extra['doi'] = rm.get('DOI').lower() - key = rm.get('key') - if key and key.startswith(obj['DOI'].upper()): - key = key.replace(obj['DOI'].upper() + "-", '') - key = key.replace(obj['DOI'].upper(), '') - container_name = rm.get('volume-title') - if not container_name: - container_name = rm.get('journal-title') - extra.pop('DOI', None) - extra.pop('key', None) - extra.pop('year', None) - extra.pop('volume-name', None) - extra.pop('journal-title', None) - extra.pop('title', None) - extra.pop('first-page', None) - extra.pop('doi-asserted-by', None) - if extra: - extra = dict(crossref=extra) - else: - extra = None - refs.append(fatcat_client.ReleaseRef( - index=i, - # doing lookups would be a second import pass - target_release_id=None, - key=key, - year=year, - container_name=container_name, - title=rm.get('title'), - locator=rm.get('first-page'), - # TODO: just dump JSON somewhere here? 
- extra=extra)) - - # abstracts - abstracts = [] - if obj.get('abstract') != None: - abstracts.append(fatcat_client.ReleaseEntityAbstracts( - mimetype="application/xml+jats", - content=obj.get('abstract'))) - - # extra fields - extra = dict() - for key in ('subject', 'type', 'license', 'alternative-id', - 'container-title', 'original-title', 'subtitle', 'archive', - 'funder', 'group-title'): - # TODO: unpack "container-title" array - val = obj.get(key) - if val: - extra[key] = val - if 'license' in extra and extra['license']: - for i in range(len(extra['license'])): - if 'start' in extra['license'][i]: - extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] - if len(obj['title']) > 1: - extra['other-titles'] = obj['title'][1:] - # TODO: this should be top-level - extra['is_kept'] = len(obj.get('archive', [])) > 0 - - # ISBN - isbn13 = None - for raw in obj.get('ISBN', []): - # TODO: convert if not ISBN-13 format - if len(raw) == 17: - isbn13 = raw - break - - # release status - if obj['type'] in ('journal-article', 'conference-proceeding', 'book', - 'dissertation', 'book-chapter'): - release_status = "published" - else: - # unknown - release_status = None - - # external identifiers - extids = self.lookup_ext_ids(doi=obj['DOI'].lower()) - - # TODO: filter out huge releases; we'll get them later (and fix bug in - # fatcatd) - if max(len(contribs), len(refs), len(abstracts)) > 750: - return None - - # release date parsing is amazingly complex - release_date = obj['issued']['date-parts'][0] - if not release_date or not release_date[0]: - # got some NoneType, even though at least year is supposed to be set - release_date = None - elif len(release_date) == 3: - release_date = datetime.datetime(year=release_date[0], month=release_date[1], day=release_date[2]) - else: - # only the year is actually required; mangle to first day for date - # (TODO: something better?) 
- release_date = datetime.datetime(year=release_date[0], month=1, day=1) - # convert to string ISO datetime format (if not null) - if release_date: - release_date = release_date.isoformat() + "Z" - - re = fatcat_client.ReleaseEntity( - work_id=None, - title=obj['title'][0], - contribs=contribs, - refs=refs, - container_id=container_id, - publisher=publisher, - release_type=obj['type'], - release_status=release_status, - doi=obj['DOI'].lower(), - isbn13=isbn13, - core_id=extids['core_id'], - pmid=extids['pmid'], - pmcid=extids['pmcid'], - wikidata_qid=extids['wikidata_qid'], - release_date=release_date, - issue=obj.get('issue'), - volume=obj.get('volume'), - pages=obj.get('page'), - abstracts=abstracts, - extra=dict(crossref=extra)) - return (re, ce) - - def create_row(self, row, editgroup=None): - if row is None: - return - obj = json.loads(row) - entities = self.parse_crossref_dict(obj) - if entities is not None: - (re, ce) = entities - if ce is not None: - container = self.api.create_container(ce, editgroup=editgroup) - re.container_id = container.ident - self._issnl_id_map[ce.issnl] = container.ident - self.api.create_release(re, editgroup=editgroup) - self.insert_count = self.insert_count + 1 - - def create_batch(self, batch, editgroup=None): - """Current work/release pairing disallows batch creation of releases. 
- Could do batch work creation and then match against releases, but meh.""" - release_batch = [] - for row in batch: - if row is None: - continue - obj = json.loads(row) - entities = self.parse_crossref_dict(obj) - if entities is not None: - (re, ce) = entities - if ce is not None: - ce_eg = self.api.create_editgroup( - fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) - container = self.api.create_container(ce, editgroup=ce_eg.id) - self.api.accept_editgroup(ce_eg.id) - re.container_id = container.ident - self._issnl_id_map[ce.issnl] = container.ident - release_batch.append(re) - self.api.create_release_batch(release_batch, autoaccept="true", editgroup=editgroup) - self.insert_count = self.insert_count + len(release_batch) diff --git a/python/fatcat/elastic_workers.py b/python/fatcat/elastic_workers.py deleted file mode 100644 index 3d2e9c39..00000000 --- a/python/fatcat/elastic_workers.py +++ /dev/null @@ -1,47 +0,0 @@ - -import json -import time -import requests -from fatcat.worker_common import FatcatWorker -from fatcat_client.models import ReleaseEntity -from fatcat.entity_helpers import * -from pykafka.common import OffsetType - - -class FatcatElasticReleaseWorker(FatcatWorker): - """ - Consumes from release-updates topic and pushes into (presumably local) - elasticsearch. - - Uses a consumer group to manage offset. 
- """ - - def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, - elastic_backend="http://localhost:9200", elastic_index="fatcat"): - super().__init__(kafka_hosts=kafka_hosts, - consume_topic=consume_topic, - api_host_url=None) - self.consumer_group = "elastic-updates" - self.elastic_backend = elastic_backend - self.elastic_index = elastic_index - - def run(self): - consume_topic = self.kafka.topics[self.consume_topic] - - consumer = consume_topic.get_balanced_consumer( - consumer_group=self.consumer_group, - managed=True, - ) - - for msg in consumer: - json_str = msg.value.decode('utf-8') - release = entity_from_json(json_str, ReleaseEntity) - #print(release) - elastic_endpoint = "{}/{}/release/{}".format( - self.elastic_backend, - self.elastic_index, - release.ident) - print("Updating document: {}".format(elastic_endpoint)) - resp = requests.post(elastic_endpoint, json=release.to_elastic_dict()) - assert resp.status_code in (200, 201) - consumer.commit_offsets() diff --git a/python/fatcat/entity_helpers.py b/python/fatcat/entity_helpers.py deleted file mode 100644 index c454536b..00000000 --- a/python/fatcat/entity_helpers.py +++ /dev/null @@ -1,100 +0,0 @@ - -import collections -from fatcat_client.models import ReleaseEntity -from fatcat_client.api_client import ApiClient - -def entity_to_json(entity): - ac = ApiClient() - return ac.sanitize_for_serialization(entity) - -def entity_from_json(json_str, entity_type): - """ - Hack to take advantage of the code-generated deserialization code - """ - ac = ApiClient() - thing = collections.namedtuple('Thing', ['data']) - thing.data = json_str - return ac.deserialize(thing, entity_type) - -def release_elastic_dict(release): - """ - Converts from an entity model/schema to elasticsearch oriented schema. 
- - Returns: dict - """ - - if release.state != 'active': - raise ValueError("Entity is not 'active'") - - # First, the easy ones (direct copy) - t = dict( - ident = release.ident, - revision = release.revision, - title = release.title, - release_type = release.release_type, - release_status = release.release_status, - language = release.language, - doi = release.doi, - pmid = release.pmid, - pmcid = release.pmcid, - isbn13 = release.isbn13, - core_id = release.core_id, - wikidata_qid = release.wikidata_qid - ) - - if release.release_date: - # TODO: resolve why this can be either a string or datetime - if type(release.release_date) == str: - t['release_date'] = release.release_date - else: - t['release_date'] = release.release_date.strftime('%F') - - container = release.container - container_is_kept = False - if container: - t['publisher'] = container.publisher - t['container_name'] = container.name - t['container_issnl'] = container.issnl - container_extra = container.extra - if container_extra: - t['container_is_oa'] = container_extra.get('is_oa') - container_is_kept = container_extra.get('is_kept', False) - t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') - else: - t['publisher'] = release.publisher - - files = release.files or [] - t['file_count'] = len(files) - in_wa = False - in_ia = False - t['file_pdf_url'] = None - for f in files: - is_pdf = 'pdf' in f.get('mimetype', '') - for url in f.get('urls', []): - if url.get('rel', '') == 'webarchive': - in_wa = True - if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']: - in_ia = True - if is_pdf: - t['file_pdf_url'] = url['url'] - if not t['file_pdf_url'] and is_pdf: - t['file_pdf_url'] = url['url'] - t['file_in_webarchive'] = in_wa - t['file_in_ia'] = in_ia - - extra = release.extra or dict() - if extra: - t['in_shadow'] = extra.get('in_shadow') - if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): - t['container_is_longtail_oa'] = True - t['any_abstract'] = 
bool(release.abstracts) - t['is_kept'] = container_is_kept or extra.get('is_kept', False) - - t['ref_count'] = len(release.refs or []) - t['contrib_count'] = len(release.contribs or []) - contrib_names = [] - for c in (release.contribs or []): - if c.raw_name: - contrib_names.append(c.raw_name) - t['contrib_names'] = contrib_names - return t diff --git a/python/fatcat/fcid.py b/python/fatcat/fcid.py deleted file mode 100644 index dd72b242..00000000 --- a/python/fatcat/fcid.py +++ /dev/null @@ -1,17 +0,0 @@ - -import base64 -import uuid - -def fcid2uuid(s): - s = s.split('_')[-1].upper().encode('utf-8') - assert len(s) == 26 - raw = base64.b32decode(s + b"======") - return str(uuid.UUID(bytes=raw)).lower() - -def uuid2fcid(s): - raw = uuid.UUID(s).bytes - return base64.b32encode(raw)[:26].lower().decode('utf-8') - -def test_fcid(): - test_uuid = '00000000-0000-0000-3333-000000000001' - assert test_uuid == fcid2uuid(uuid2fcid(test_uuid)) diff --git a/python/fatcat/grobid_metadata_importer.py b/python/fatcat/grobid_metadata_importer.py deleted file mode 100755 index 95cc285e..00000000 --- a/python/fatcat/grobid_metadata_importer.py +++ /dev/null @@ -1,168 +0,0 @@ -#!/usr/bin/env python3 - -import sys -import json -import base64 -import datetime -import fatcat_client -from fatcat.importer_common import FatcatImporter - -MAX_ABSTRACT_BYTES=4096 - - -class FatcatGrobidMetadataImporter(FatcatImporter): - - def __init__(self, host_url, default_link_rel="web"): - super().__init__(host_url) - self.default_link_rel = default_link_rel - - def parse_grobid_json(self, obj): - - if not obj.get('title'): - return None - - release = dict() - extra = dict() - - if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: - abobj = dict( - mimetype="text/plain", - language=None, - content=obj.get('abstract').strip()) - abstracts = [abobj] - else: - abstracts = None - - contribs = [] - for i, a in enumerate(obj.get('authors', [])): - c = dict(raw_name=a['name'], 
role="author") - contribs.append(fatcat_client.ReleaseContrib( - index=i, - raw_name=a['name'], - role="author", - extra=None)) - - refs = [] - for raw in obj.get('citations', []): - cite_extra = dict() - ref = dict() - ref['key'] = raw.get('id') - if raw.get('title'): - ref['title'] = raw['title'].strip() - if raw.get('date'): - try: - year = int(raw['date'].strip()[:4]) - ref['year'] = year - except: - pass - for key in ('volume', 'url', 'issue', 'publisher'): - if raw.get(key): - cite_extra[key] = raw[key].strip() - if raw.get('authors'): - cite_extra['authors'] = [a['name'] for a in raw['authors']] - if cite_extra: - cite_extra = dict(grobid=cite_extra) - else: - cite_extra = None - ref['extra'] = cite_extra - refs.append(ref) - - release_type = "journal-article" - release_date = None - if obj.get('date'): - # TODO: only returns year, ever? how to handle? - release_date = datetime.datetime(year=int(obj['date'][:4]), month=1, day=1) - - if obj.get('doi'): - extra['doi'] = obj['doi'] - if obj['journal'] and obj['journal'].get('name'): - extra['container_name'] = obj['journal']['name'] - - extra['is_longtail_oa'] = True - - # TODO: ISSN/eISSN handling? or just journal name lookup? - - if extra: - extra = dict(grobid=extra) - else: - extra = None - - re = fatcat_client.ReleaseEntity( - title=obj['title'].strip(), - contribs=contribs, - refs=refs, - publisher=obj['journal'].get('publisher'), - volume=obj['journal'].get('volume'), - issue=obj['journal'].get('issue'), - abstracts=abstracts, - extra=extra) - return re - - # TODO: make this a common function somewhere - def make_url(self, raw): - rel = self.default_link_rel - # TODO: this is where we could map specific domains to rel types, - # and also filter out bad domains, invalid URLs, etc - if "//archive.org/" in raw or "//arxiv.org/" in raw: - # TODO: special-case the arxiv.org bulk mirror? 
- rel = "repository" - elif "//web.archive.org/" in raw or "//archive.is/" in raw: - rel = "webarchive" - return fatcat_client.FileEntityUrls(url=raw, rel=rel) - - def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size): - - sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() - - # lookup existing SHA1, or create new entity - try: - existing_file = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - existing_file = None - - if existing_file: - # if file is already in here, presumably not actually long-tail - return None - fe = fatcat_client.FileEntity( - sha1=sha1, - size=int(file_size), - mimetype=mimetype, - releases=[], - urls=[], - ) - - # parse URLs and CDX - original = cdx['url'] - wayback = "https://web.archive.org/web/{}/{}".format( - cdx['dt'], - original) - fe.urls.append( - fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) - original_url = self.make_url(original) - if original_url != None: - fe.urls.append(original_url) - - return fe - - def create_row(self, row, editgroup=None): - if not row: - return - fields = row.split('\t') - sha1_key = fields[0] - cdx = json.loads(fields[1]) - mimetype = fields[2] - file_size = int(fields[3]) - grobid_meta = json.loads(fields[4]) - fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) - re = self.parse_grobid_json(grobid_meta) - if fe and re: - release_entity = self.api.create_release(re, editgroup=editgroup) - # release ident can't already be in release list because we just - # created it - fe.releases.append(release_entity.ident) - file_entity = self.api.create_file(fe, editgroup=editgroup) - self.insert_count = self.insert_count + 1 - - # NB: batch mode not implemented diff --git a/python/fatcat/importer_common.py b/python/fatcat/importer_common.py deleted file mode 100644 index 8dfee875..00000000 --- a/python/fatcat/importer_common.py +++ /dev/null @@ -1,137 +0,0 @@ - 
-import re -import sys -import csv -import json -import itertools -import fatcat_client -from fatcat_client.rest import ApiException - -# from: https://docs.python.org/3/library/itertools.html -def grouper(iterable, n, fillvalue=None): - "Collect data into fixed-length chunks or blocks" - args = [iter(iterable)] * n - return itertools.zip_longest(*args, fillvalue=fillvalue) - -class FatcatImporter: - - def __init__(self, host_url, issn_map_file=None): - conf = fatcat_client.Configuration() - conf.host = host_url - self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) - self._issnl_id_map = dict() - self._orcid_id_map = dict() - self._doi_id_map = dict() - self._issn_issnl_map = None - self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$") - if issn_map_file: - self.read_issn_map_file(issn_map_file) - self.processed_lines = 0 - self.insert_count = 0 - self.update_count = 0 - - def describe_run(self): - print("Processed {} lines, inserted {}, updated {}.".format( - self.processed_lines, self.insert_count, self.update_count)) - - def process_source(self, source, group_size=100): - """Creates and auto-accepts editgroup every group_size rows""" - eg = self.api.create_editgroup( - fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) - for i, row in enumerate(source): - self.create_row(row, editgroup=eg.id) - if i > 0 and (i % group_size) == 0: - self.api.accept_editgroup(eg.id) - eg = self.api.create_editgroup( - fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) - self.processed_lines = self.processed_lines + 1 - if i == 0 or (i % group_size) != 0: - self.api.accept_editgroup(eg.id) - - def process_batch(self, source, size=50): - """Reads and processes in batches (not API-call-per-)""" - for rows in grouper(source, size): - self.processed_lines = self.processed_lines + len(rows) - eg = self.api.create_editgroup( - fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) - self.create_batch(rows, 
editgroup=eg.id) - - def process_csv_source(self, source, group_size=100, delimiter=','): - reader = csv.DictReader(source, delimiter=delimiter) - self.process_source(reader, group_size) - - def process_csv_batch(self, source, size=50, delimiter=','): - reader = csv.DictReader(source, delimiter=delimiter) - self.process_batch(reader, size) - - def is_issnl(self, issnl): - return len(issnl) == 9 and issnl[4] == '-' - - def lookup_issnl(self, issnl): - """Caches calls to the ISSN-L lookup API endpoint in a local dict""" - if issnl in self._issnl_id_map: - return self._issnl_id_map[issnl] - container_id = None - try: - rv = self.api.lookup_container(issnl=issnl) - container_id = rv.ident - except ApiException as ae: - # If anything other than a 404 (not found), something is wrong - assert ae.status == 404 - self._issnl_id_map[issnl] = container_id # might be None - return container_id - - def is_orcid(self, orcid): - return self._orcid_regex.match(orcid) != None - - def lookup_orcid(self, orcid): - """Caches calls to the Orcid lookup API endpoint in a local dict""" - if not self.is_orcid(orcid): - return None - if orcid in self._orcid_id_map: - return self._orcid_id_map[orcid] - creator_id = None - try: - rv = self.api.lookup_creator(orcid=orcid) - creator_id = rv.ident - except ApiException as ae: - # If anything other than a 404 (not found), something is wrong - assert ae.status == 404 - self._orcid_id_map[orcid] = creator_id # might be None - return creator_id - - def is_doi(self, doi): - return doi.startswith("10.") and doi.count("/") >= 1 - - def lookup_doi(self, doi): - """Caches calls to the doi lookup API endpoint in a local dict""" - assert self.is_doi(doi) - doi = doi.lower() - if doi in self._doi_id_map: - return self._doi_id_map[doi] - release_id = None - try: - rv = self.api.lookup_release(doi=doi) - release_id = rv.ident - except ApiException as ae: - # If anything other than a 404 (not found), something is wrong - assert ae.status == 404 - 
self._doi_id_map[doi] = release_id # might be None - return release_id - - def read_issn_map_file(self, issn_map_file): - print("Loading ISSN map file...") - self._issn_issnl_map = dict() - for line in issn_map_file: - if line.startswith("ISSN") or len(line) == 0: - continue - (issn, issnl) = line.split()[0:2] - self._issn_issnl_map[issn] = issnl - # double mapping makes lookups easy - self._issn_issnl_map[issnl] = issnl - print("Got {} ISSN-L mappings.".format(len(self._issn_issnl_map))) - - def issn2issnl(self, issn): - if issn is None: - return None - return self._issn_issnl_map.get(issn) diff --git a/python/fatcat/issn_importer.py b/python/fatcat/issn_importer.py deleted file mode 100644 index c9ef50b5..00000000 --- a/python/fatcat/issn_importer.py +++ /dev/null @@ -1,72 +0,0 @@ - -import sys -import json -import itertools -import fatcat_client -from fatcat.importer_common import FatcatImporter - -# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): -# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count - -def or_none(s): - if s is None: - return None - if len(s) == 0: - return None - return s - -def truthy(s): - if s is None: - return None - s = s.lower() - if s in ('true', 't', 'yes', 'y', '1'): - return True - elif s in ('false', 'f', 'no', 'n', '0'): - return False - else: - return None - -class FatcatIssnImporter(FatcatImporter): - - def parse_issn_row(self, row): - """ - row is a python dict (parsed from CSV). 
- returns a ContainerEntity - """ - title = or_none(row['title']) - issnl = or_none(row['ISSN-L']) - if title is None or issnl is None: - return - extra = dict( - in_doaj=truthy(row['in_doaj']), - in_road=truthy(row['in_road']), - in_norwegian=truthy(row['in_norwegian']), - language=or_none(row['lang']), - url=or_none(row['url']), - ISSNp=or_none(row['ISSN-print']), - ISSNe=or_none(row['ISSN-electronic']), - is_oa=truthy(row['is_oa']), - is_kept=truthy(row['is_kept']), - ) - ce = fatcat_client.ContainerEntity( - issnl=issnl, - name=title, - publisher=or_none(row['publisher']), - abbrev=None, - coden=None, - extra=extra) - return ce - - def create_row(self, row, editgroup=None): - ce = self.parse_issn_row(row) - if ce is not None: - self.api.create_container(ce, editgroup=editgroup) - self.insert_count = self.insert_count + 1 - - def create_batch(self, batch, editgroup=None): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_issn_row(l) - for l in batch if l != None] - objects = [o for o in objects if o != None] - self.api.create_container_batch(objects, autoaccept="true", editgroup=editgroup) - self.insert_count = self.insert_count + len(objects) diff --git a/python/fatcat/matched_importer.py b/python/fatcat/matched_importer.py deleted file mode 100644 index 7f55369b..00000000 --- a/python/fatcat/matched_importer.py +++ /dev/null @@ -1,144 +0,0 @@ - -import sys -import json -import sqlite3 -import itertools -import fatcat_client -from fatcat.importer_common import FatcatImporter - -#row = row.split('\t') -#assert len(row) == 2 -#sha1 = row[0].replace('sha1:') -#sha1 = base64.b16encode(base64.b32decode(sha1)).lower() -#print(sha1) -#dois = [d.lower() for d in json.loads(row[1])] - -class FatcatMatchedImporter(FatcatImporter): - """ - Input format is JSON with keys: - - dois (list) - - sha1 (hex) - - md5 (hex) - - sha256 (hex) - - size (int) - - cdx (list of objects) - - dt - - url - - mimetype - - urls (list of strings... 
or objects?) - - Future handlings/extensions: - - core_id, wikidata_id, pmcid, pmid: not as lists - """ - - def __init__(self, host_url, skip_file_update=False, default_mime=None, - default_link_rel="web"): - super().__init__(host_url) - self.default_mime = default_mime - self.default_link_rel = default_link_rel - self.skip_file_update = skip_file_update - - def make_url(self, raw): - rel = self.default_link_rel - # TODO: this is where we could map specific domains to rel types, - # and also filter out bad domains, invalid URLs, etc - if "//archive.org/" in raw or "//arxiv.org/" in raw: - # TODO: special-case the arxiv.org bulk mirror? - rel = "repository" - elif "//web.archive.org/" in raw or "//archive.is/" in raw: - rel = "webarchive" - return fatcat_client.FileEntityUrls(url=raw, rel=rel) - - def parse_matched_dict(self, obj): - sha1 = obj['sha1'] - dois = [d.lower() for d in obj.get('dois', [])] - - # lookup sha1, or create new entity - fe = None - if not self.skip_file_update: - try: - fe = self.api.lookup_file(sha1=sha1) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - if fe is None: - fe = fatcat_client.FileEntity( - sha1=sha1, - releases=[], - urls=[], - ) - - # lookup dois - re_list = set() - for doi in dois: - try: - re = self.api.lookup_release(doi=doi) - except fatcat_client.rest.ApiException as err: - if err.status != 404: - raise err - re = None - if re is None: - print("DOI not found: {}".format(doi)) - else: - re_list.add(re.ident) - if len(re_list) == 0: - return None - if fe.releases == set(re_list): - return None - re_list.update(fe.releases) - fe.releases = list(re_list) - - # parse URLs and CDX - existing_urls = [feu.url for feu in fe.urls] - for url in obj.get('url', []): - if url not in existing_urls: - url = self.make_url(url) - if url != None: - fe.urls.append(url) - for cdx in obj.get('cdx', []): - original = cdx['url'] - wayback = "https://web.archive.org/web/{}/{}".format( - cdx['dt'], - original) - 
if wayback not in existing_urls: - fe.urls.append( - fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) - if original not in existing_urls: - url = self.make_url(original) - if url != None: - fe.urls.append(url) - - if obj.get('size') != None: - fe.size = int(obj['size']) - fe.sha256 = obj.get('sha256', fe.sha256) - fe.md5 = obj.get('md5', fe.sha256) - if obj.get('mimetype') is None: - if fe.mimetype is None: - fe.mimetype = self.default_mime - else: - fe.mimetype = obj.get('mimetype') - return fe - - def create_row(self, row, editgroup=None): - obj = json.loads(row) - fe = self.parse_matched_dict(obj) - if fe is not None: - if fe.ident is None: - self.api.create_file(fe, editgroup=editgroup) - self.insert_count = self.insert_count + 1 - else: - self.api.update_file(fe.ident, fe, editgroup=editgroup) - self.update_count = self.update_count + 1 - - def create_batch(self, batch, editgroup=None): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_matched_dict(json.loads(l)) - for l in batch if l != None] - new_objects = [o for o in objects if o != None and o.ident == None] - update_objects = [o for o in objects if o != None and o.ident != None] - for obj in update_objects: - self.api.update_file(obj.ident, obj, editgroup=editgroup) - if len(new_objects) > 0: - self.api.create_file_batch(new_objects, autoaccept="true", editgroup=editgroup) - self.update_count = self.update_count + len(update_objects) - self.insert_count = self.insert_count + len(new_objects) diff --git a/python/fatcat/orcid_importer.py b/python/fatcat/orcid_importer.py deleted file mode 100644 index e1f5943c..00000000 --- a/python/fatcat/orcid_importer.py +++ /dev/null @@ -1,73 +0,0 @@ - -import sys -import json -import itertools -import fatcat_client -from fatcat.importer_common import FatcatImporter - -def value_or_none(e): - if type(e) == dict: - e = e.get('value') - if type(e) == str and len(e) == 0: - e = None - # TODO: this is probably bogus; patched 
in desperation; remove? - if e: - try: - e.encode() - except UnicodeEncodeError: - # Invalid JSON? - print("BAD UNICODE") - return None - return e - -class FatcatOrcidImporter(FatcatImporter): - - def parse_orcid_dict(self, obj): - """ - obj is a python dict (parsed from json). - returns a CreatorEntity - """ - name = obj['person']['name'] - if name is None: - return None - extra = None - given = value_or_none(name.get('given-names')) - sur = value_or_none(name.get('family-name')) - display = value_or_none(name.get('credit-name')) - if display is None: - # TODO: sorry human beings - if given and sur: - display = "{} {}".format(given, sur) - elif sur: - display = sur - elif given: - display = given - else: - # must have *some* name - return None - orcid = obj['orcid-identifier']['path'] - if not self.is_orcid(orcid): - sys.stderr.write("Bad ORCID: {}\n".format(orcid)) - return None - ce = fatcat_client.CreatorEntity( - orcid=orcid, - given_name=given, - surname=sur, - display_name=display, - extra=extra) - return ce - - def create_row(self, row, editgroup=None): - obj = json.loads(row) - ce = self.parse_orcid_dict(obj) - if ce is not None: - self.api.create_creator(ce, editgroup=editgroup) - self.insert_count = self.insert_count + 1 - - def create_batch(self, batch, editgroup=None): - """Reads and processes in batches (not API-call-per-line)""" - objects = [self.parse_orcid_dict(json.loads(l)) - for l in batch if l != None] - objects = [o for o in objects if o != None] - self.api.create_creator_batch(objects, autoaccept="true", editgroup=editgroup) - self.insert_count = self.insert_count + len(objects) diff --git a/python/fatcat/raw_api_client.py b/python/fatcat/raw_api_client.py deleted file mode 100644 index 75151ebb..00000000 --- a/python/fatcat/raw_api_client.py +++ /dev/null @@ -1,66 +0,0 @@ - -import sys -import json -import requests - - -class RawFatcatApiClient: - - def __init__(self, host_url): - self.host_url = host_url - self.session = requests.Session() 
- self._issn_map = dict() - - def get(self, path, data=None): - headers = {"content-type": "application/json"} - return self.session.get(self.host_url + path, json=data, - headers=headers) - - def post(self, path, data=None): - headers = {"content-type": "application/json"} - return self.session.post(self.host_url + path, json=data, - headers=headers) - - def new_editgroup(self): - rv = self.post('/v0/editgroup', data=dict( - editor_id=1)) - print(rv) - print(rv.json()) - assert rv.status_code == 201 - editgroup_id = rv.json()['id'] - return editgroup_id - - def accept_editgroup(self, eg): - rv = self.post('/v0/editgroup/{}/accept'.format(eg)) - assert rv.status_code == 200 - return rv - - def import_issn_file(self, json_file, create_containers=False, batchsize=100): - eg = self.new_editgroup() - i = 0 - with open(json_file, 'r') as file: - for line in file: - if i % batchsize == 0: - sys.stdout.write('\n{}: '.format(i)) - if (i+1) % 20 == 0: - sys.stdout.write('.') - i = i + 1 - obj = json.loads(line) - if not ("author" in obj and "title" in obj): - continue - try: - self.import_crossref_dict(obj, editgroup=eg, - create_containers=create_containers) - except Exception as e: - print("ERROR: {}".format(e)) - if i % batchsize == 0: - self.accept_editgroup(eg) - eg = self.new_editgroup() - if i % batchsize != 0: - self.accept_editgroup(eg) - print("done!") - - def health(self): - rv = self.get("/health") - assert rv.status_code == 200 - return rv.json() diff --git a/python/fatcat/routes.py b/python/fatcat/routes.py deleted file mode 100644 index ddb56abd..00000000 --- a/python/fatcat/routes.py +++ /dev/null @@ -1,364 +0,0 @@ - -import os -import json -from flask import Flask, render_template, send_from_directory, request, \ - url_for, abort, g, redirect, jsonify, session -from fatcat import app, api -from fatcat_client.rest import ApiException -from fatcat.search import do_search - - -### Views ################################################################### - 
-@app.route('/container//history', methods=['GET']) -def container_history(ident): - try: - entity = api.get_container(ident) - history = api.get_container_history(ident) - except ApiException as ae: - abort(ae.status) - #print(history) - return render_template('entity_history.html', - page_title=entity.name, - entity_type="container", - entity=entity, - history=history) - -@app.route('/container//edit', methods=['GET']) -def container_edit_view(ident): - try: - entity = api.get_container(ident) - except ApiException as ae: - abort(ae.status) - return render_template('entity_edit.html') - -#@app.route('/container//edit', methods=['POST']) -#def container_edit(ident): -# raise NotImplemented() -# params = dict() -# for k in request.form: -# if k.startswith('container_'): -# params[k[10:]] = request.form[k] -# edit = api.update_container(params=params) -# return redirect("/container/{}".format(edit.ident)) -# # else: -# #return render_template('container_edit.html') - -@app.route('/container/create', methods=['GET']) -def container_create_view(): - return render_template('container_create.html') - -@app.route('/container/create', methods=['POST']) -def container_create(): - params = dict() - for k in request.form: - if k.startswith('container_'): - params[k[10:]] = request.form[k] - edit = api.create_container(params=params) - return redirect("/container/{}".format(edit.ident)) - -@app.route('/container/lookup', methods=['GET']) -def container_lookup(): - issnl = request.args.get('issnl') - if issnl is None: - abort(400) - try: - resp = api.lookup_container(issnl) - except ApiException as ae: - abort(ae.status) - return redirect('/container/{}'.format(resp.ident)) - -@app.route('/container/', methods=['GET']) -def container_view(ident): - try: - entity = api.get_container(ident) - except ApiException as ae: - abort(ae.status) - return render_template('container_view.html', container=entity) - -@app.route('/creator//history', methods=['GET']) -def 
creator_history(ident): - try: - entity = api.get_creator(ident) - history = api.get_creator_history(ident) - except ApiException as ae: - abort(ae.status) - return render_template('entity_history.html', - page_title=entity.display_name, - entity_type="creator", - entity=entity, - history=history) - -@app.route('/creator//edit', methods=['GET']) -def creator_edit_view(ident): - try: - entity = api.get_creator(ident) - except ApiException as ae: - abort(ae.status) - return render_template('entity_edit.html') - -@app.route('/creator/lookup', methods=['GET']) -def creator_lookup(): - orcid = request.args.get('orcid') - if orcid is None: - abort(400) - try: - resp = api.lookup_creator(orcid) - except ApiException as ae: - abort(ae.status) - return redirect('/creator/{}'.format(resp.ident)) - -@app.route('/creator/', methods=['GET']) -def creator_view(ident): - try: - entity = api.get_creator(ident) - releases = api.get_creator_releases(ident) - except ApiException as ae: - abort(ae.status) - return render_template('creator_view.html', creator=entity, releases=releases) - -@app.route('/file//history', methods=['GET']) -def file_history(ident): - try: - entity = api.get_file(ident) - history = api.get_file_history(ident) - except ApiException as ae: - abort(ae.status) - return render_template('entity_history.html', - page_title=None, - entity_type="file", - entity=entity, - history=history) - -@app.route('/file//edit', methods=['GET']) -def file_edit_view(ident): - try: - entity = api.get_file(ident) - except ApiException as ae: - abort(ae.status) - return render_template('entity_edit.html') - -@app.route('/file/lookup', methods=['GET']) -def file_lookup(): - sha1 = request.args.get('sha1') - if sha1 is None: - abort(400) - try: - resp = api.lookup_file(sha1) - except ApiException as ae: - abort(ae.status) - return redirect('/file/{}'.format(resp.ident)) - -@app.route('/file/', methods=['GET']) -def file_view(ident): - try: - entity = api.get_file(ident) - except 
ApiException as ae: - abort(ae.status) - return render_template('file_view.html', file=entity) - -@app.route('/release/lookup', methods=['GET']) -def release_lookup(): - doi = request.args.get('doi') - if doi is None: - abort(400) - try: - resp = api.lookup_release(doi) - except ApiException as ae: - abort(ae.status) - return redirect('/release/{}'.format(resp.ident)) - -@app.route('/release/create', methods=['GET']) -def release_create_view(): - return render_template('release_create.html') - -@app.route('/release/create', methods=['POST']) -def release_create(): - params = dict() - for k in request.form: - if k.startswith('release_'): - params[k[10:]] = request.form[k] - edit = api.create_release(params=params) - return redirect("/release/{}".format(edit.ident)) - -@app.route('/release//history', methods=['GET']) -def release_history(ident): - try: - entity = api.get_release(ident) - history = api.get_release_history(ident) - except ApiException as ae: - abort(ae.status) - return render_template('entity_history.html', - page_title=entity.title, - entity_type="release", - entity=entity, - history=history) - -@app.route('/release//edit', methods=['GET']) -def release_edit_view(ident): - try: - entity = api.get_release(ident) - except ApiException as ae: - abort(ae.status) - return render_template('entity_edit.html') - -@app.route('/release/', methods=['GET']) -def release_view(ident): - try: - entity = api.get_release(ident) - files = api.get_release_files(ident) - container = None - if entity.container_id is not None: - container = api.get_container(entity.container_id) - except ApiException as ae: - abort(ae.status) - authors = [c for c in entity.contribs if c.role in ('author', None)] - authors = sorted(authors, key=lambda c: c.index) - for fe in files: - # crudely filter out exact duplicates - kept = [] - for u in fe.urls: - if not u in kept: - kept.append(u) - fe.urls = [u for u in kept if not '/web/None/' in u.url] - return 
render_template('release_view.html', release=entity, - authors=authors, files=files, container=container) - -@app.route('/work/create', methods=['GET']) -def work_create_view(): - return abort(404) - -@app.route('/work//history', methods=['GET']) -def work_history(ident): - try: - entity = api.get_work(ident) - history = api.get_work_history(ident) - except ApiException as ae: - abort(ae.status) - return render_template('entity_history.html', - page_title=None, - entity_type="work", - entity=entity, - history=history) - -@app.route('/work//edit', methods=['GET']) -def work_edit_view(ident): - try: - entity = api.get_work(ident) - except ApiException as ae: - abort(ae.status) - return render_template('entity_edit.html') - -@app.route('/work/', methods=['GET']) -def work_view(ident): - try: - entity = api.get_work(ident) - releases = api.get_work_releases(ident) - except ApiException as ae: - abort(ae.status) - return render_template('work_view.html', work=entity, releases=releases) - -@app.route('/editgroup/current', methods=['GET']) -def editgroup_current(): - raise NotImplemented() - #eg = api.get_or_create_editgroup() - #return redirect('/editgroup/{}'.format(eg.id)) - -@app.route('/editgroup/', methods=['GET']) -def editgroup_view(ident): - try: - entity = api.get_editgroup(str(ident)) - except ApiException as ae: - abort(ae.status) - return render_template('editgroup_view.html', editgroup=entity) - -@app.route('/editor/', methods=['GET']) -def editor_view(ident): - entity = api.get_editor(ident) - return render_template('editor_view.html', editor=entity) - -@app.route('/editor//changelog', methods=['GET']) -def editor_changelog(ident): - editor = api.get_editor(ident) - changelog_entries = api.get_editor_changelog(ident) - return render_template('editor_changelog.html', editor=editor, - changelog_entries=changelog_entries) - -@app.route('/changelog', methods=['GET']) -def changelog_view(): - try: - entries = api.get_changelog(limit=request.args.get('limit')) - 
except ApiException as ae: - abort(ae.status) - return render_template('changelog.html', entries=entries) - -@app.route('/changelog/', methods=['GET']) -def changelog_entry_view(index): - try: - entry = api.get_changelog_entry(int(index)) - except ApiException as ae: - abort(ae.status) - return render_template('changelog_view.html', entry=entry, editgroup=entry.editgroup) - -@app.route('/stats', methods=['GET']) -def stats_view(): - stats = api.get_stats() - return render_template('stats.html', stats=stats.extra) - -### Search ################################################################## - -@app.route('/release/search', methods=['GET', 'POST']) -def search(): - - limit = 20 - query = request.args.get('q') - fulltext_only = bool(request.args.get('fulltext_only')) - - # Convert raw DOIs to DOI queries - if query is not None: - oldquery = query.split() - for word in oldquery: - if word.startswith("10.") and word.count("/") >= 1: - query = query.replace(word, 'doi:"{}"'.format(word)) - - if 'q' in request.args.keys(): - # always do files for HTML - found = do_search(query, limit=limit, fulltext_only=fulltext_only) - return render_template('release_search.html', found=found, query=query, fulltext_only=fulltext_only) - else: - return render_template('release_search.html', query=query, fulltext_only=fulltext_only) - - -### Static Routes ########################################################### - -@app.errorhandler(404) -def page_not_found(e): - return render_template('404.html'), 404 - -@app.route('/', methods=['GET']) -def homepage(): - return render_template('home.html') - -@app.route('/about', methods=['GET']) -def aboutpage(): - return render_template('about.html') - -@app.route('/search', methods=['GET']) -def search_redirect(): - return redirect("/release/search") - -@app.route('/robots.txt', methods=['GET']) -def robots(): - return send_from_directory(os.path.join(app.root_path, 'static'), - 'robots.txt', - mimetype='text/plain') - 
-@app.route('/static/fatcat.jpg', methods=['GET']) -def fatcat_photo(): - return send_from_directory(os.path.join(app.root_path, 'static'), - 'fatcat.jpg', - mimetype='image/jpeg') - -@app.route('/health', methods=['GET']) -def health(): - return jsonify({'ok': True}) diff --git a/python/fatcat/search.py b/python/fatcat/search.py deleted file mode 100644 index b6826110..00000000 --- a/python/fatcat/search.py +++ /dev/null @@ -1,60 +0,0 @@ - -import requests -from flask import abort -from fatcat import app - - -def do_search(q, limit=50, fulltext_only=True): - - #print("Search hit: " + q) - if limit > 100: - # Sanity check - limit = 100 - - if fulltext_only: - q += " file_in_ia:true" - - search_request = { - "query": { - "query_string": { - "query": q, - "analyzer": "textIcuSearch", - "default_operator": "AND", - "analyze_wildcard": True, - "lenient": True, - "fields": ["title^5", "contrib_names^2", "container_title"] - }, - }, - "size": int(limit), - } - - #print(search_request) - resp = requests.get("%s/%s/_search" % - (app.config['ELASTIC_BACKEND'], app.config['ELASTIC_INDEX']), - json=search_request) - - if resp.status_code != 200: - print("elasticsearch non-200 status code: " + str(resp.status_code)) - print(resp.content) - abort(resp.status_code) - - content = resp.json() - #print(content) - results = [h['_source'] for h in content['hits']['hits']] - for h in results: - # Ensure 'contrib_names' is a list, not a single string - if type(h['contrib_names']) is not list: - h['contrib_names'] = [h['contrib_names'], ] - # Handle surrogate strings that elasticsearch returns sometimes, - # probably due to mangled data processing in some pipeline. 
- # "Crimes against Unicode"; production workaround - for key in h: - if type(h[key]) is str: - h[key] = h[key].encode('utf8', 'ignore').decode('utf8') - h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] - - found = content['hits']['total'] - return {"query": { "q": q }, - "count_returned": len(results), - "count_found": found, - "results": results } diff --git a/python/fatcat/static/fatcat.jpg b/python/fatcat/static/fatcat.jpg deleted file mode 100644 index ad100381..00000000 Binary files a/python/fatcat/static/fatcat.jpg and /dev/null differ diff --git a/python/fatcat/static/robots.txt b/python/fatcat/static/robots.txt deleted file mode 100644 index a168f11b..00000000 --- a/python/fatcat/static/robots.txt +++ /dev/null @@ -1 +0,0 @@ -# Hello friends! diff --git a/python/fatcat/templates/404.html b/python/fatcat/templates/404.html deleted file mode 100644 index c8fbfeac..00000000 --- a/python/fatcat/templates/404.html +++ /dev/null @@ -1,6 +0,0 @@ -{% extends "base.html" %} -{% block body %} - -

404: Not Found

- -{% endblock %} diff --git a/python/fatcat/templates/about.html b/python/fatcat/templates/about.html deleted file mode 100644 index 85f100b7..00000000 --- a/python/fatcat/templates/about.html +++ /dev/null @@ -1,190 +0,0 @@ -{% extends "base.html" %} -{% block body %} - -

fatcat Design Document (RFC)

-

Contact: Bryan Newbold bnewbold@archive.org. Last updated 2018-08-10

-

fatcat is a proposed open bibliographic catalog of written works. The scope of works is somewhat flexible, with a focus on published research outputs like journal articles, pre-prints, and conference proceedings. Records are collaboratively editable, versioned, available in bulk form, and include URL-agnostic file-level metadata.

-

fatcat is currently used internally at the Internet Archive, but interested folks are welcome to contribute to design and development.

-

Goals and Ecosystem Niche

-

For the Internet Archive, fatcat has two primary use cases:

-
    -
  • Track the "completeness" of our holdings against all known published works. In particular, allow us to monitor and prioritize further collection work.
  • -
  • Be a public-facing catalog and access mechanism for our open access holdings.
  • -
-

In the larger ecosystem, fatcat could also provide:

-
    -
  • A work-level (as opposed to title-level) archival dashboard: what fraction of all published works are preserved in archives? KBART, CLOCKSS, Portico, and other preservation networks don't provide granular metadata
  • -
  • A collaborative, independent, non-commercial, fully-open, field-agnostic, "completeness"-oriented catalog of scholarly metadata
  • -
  • Unified (centralized) foundation for discovery and access across repositories and archives: discovery projects can focus on user experience instead of building their own catalog from scratch
  • -
  • Research corpus for meta-science, with an emphasis on availability and reproducibility (metadata corpus itself is open access, and file-level hashes control for content drift)
  • -
  • Foundational infrastructure for distributed digital preservation
  • -
  • On-ramp for non-traditional digital works ("grey literature") into the scholarly web
  • -
-

Technical Architecture

-

The canonical backend datastore exposes a microservice-like HTTP API, which could be extended with gRPC or GraphQL interfaces. The initial datastore is a transactional SQL database, but this implementation detail is abstracted by the API.

-

As little "application logic" as possible should be embedded in this back-end; as much as possible would be pushed to bots which could be authored and operated by anybody. A separate web interface project talks to the API backend and can be developed more rapidly with less concern about data loss or corruption.

-

A cronjob will create periodic database dumps, both in "full" form (all tables and all edit history, removing only authentication credentials) and "flattened" form (with only the most recent version of each entity).

-

A goal is to be linked-data/RDF/JSON-LD/semantic-web "compatible", but not necessarily "first". It should be possible to export the database in a relatively clean RDF form, and to fetch data in a variety of formats, but internally fatcat will not be backed by a triple-store, and will not be bound to a rigid third-party ontology or schema.

-

Microservice daemons should be able to proxy between the primary API and standard protocols like ResourceSync and OAI-PMH, and third-party bots could ingest or synchronize the database in those formats.

-

Licensing

-

The core fatcat database should only contain verifiable factual statements (which isn't to say that all statements are "true"), not creative or derived content.

-

The goal is to have a very permissively licensed database: CC-0 (no rights reserved) if possible. Under US law, it should be possible to scrape and pull in factual data from other corpuses without adopting their licenses. The goal here isn't to avoid attribution (progeny information will be included, and a large sources and acknowledgments statement should be maintained and shipped with bulk exports), but trying to manage the intersection of all upstream source licenses seems untenable, and creates burdens for downstream users and developers.

-

Special care will need to be taken around copyright, "original work" by editors, and contributions that raise privacy concerns. If abstracts are stored at all, they should be in a partitioned database table to prevent copyright contamination. Likewise, even simple user-created content like lists, reviews, ratings, comments, discussion, documentation, etc., should live in separate services.

-

Basic Editing Workflow and Bots

-

Both human editors and bots should have edits go through the same API, with humans using either the default web interface, integrations, or client software.

-

The normal workflow is to create edits (or updates, merges, deletions) on individual entities. Individual changes are bundled into an "edit group" of related edits (eg, correcting authorship info for multiple works related to a single author). When ready, the editor would "submit" the edit group for review. During the review period, human editors vote and bots can perform automated checks. During this period the editor can make tweaks if necessary. After some fixed time period (72 hours?) with no changes and no blocking issues, the edit group would be auto-accepted if no merge conflicts have been created by other edits to the same entities. This process balances editing labor (reviews are easy, but optional) against quality (cool-down period makes it easier to detect and prevent spam or out-of-control bots). More sophisticated roles and permissions could allow certain humans and bots to push through edits more rapidly (eg, importing new works from a publisher API).

-

Bots need to be tuned to have appropriate edit group sizes (eg, daily batches, instead of millions of works in a single edit) to make human QA review and reverts manageable.

-

Data progeny and source references are captured in the edit metadata, instead of being encoded in the entity data model itself. In the case of importing external databases, the expectation is that special-purpose bot accounts are used, and tag timestamps and external identifiers in the edit metadata. Human editors would leave edit messages to clarify their sources.

-

A style guide (wiki) and discussion forum would be hosted as separate stand-alone services for editors to propose projects and debate process or scope changes. These services should have unified accounts and logins (oauth?) to have consistent account IDs across all mediums.

-

Global Edit Changelog

-

As part of the process of "accepting" an edit group, a row would be written to an immutable, append-only log table (which internally could be a SQL table) documenting each identifier change. This changelog establishes a monotonically increasing version number for the entire corpus, and should make interaction with other systems easier (eg, search engines, replicated databases, alternative storage backends, notification frameworks, etc.).

-

Identifiers

-

A fixed number of first-class "entities" are defined, with common behavior and schema layouts. These would all be semantic entities like "work", "release", "container", and "creator".

-

fatcat identifiers are semantically meaningless fixed-length random numbers, usually represented in case-insensitive base32 format. Each entity type has its own identifier namespace.

-

128-bit (UUID size) identifiers encode as 26 characters (but note that not all such strings decode to valid UUIDs), and in the backend can be serialized in UUID columns:

-
work_rzga5b9cd7efgh04iljk8f3jvz
-https://fatcat.wiki/work/rzga5b9cd7efgh04iljk8f3jvz
-

In comparison, 96-bit identifiers would have 20 characters and look like:

-
work_rzga5b9cd7efgh04iljk
-https://fatcat.wiki/work/rzga5b9cd7efgh04iljk
-

A 64-bit namespace would probably be large enough, and would work with database Integer columns:

-
work_rzga5b9cd7efg
-https://fatcat.wiki/work/rzga5b9cd7efg
-

The idea would be to only have fatcat identifiers be used to interlink between databases, not to supplant DOIs, ISBNs, handles, ARKs, and other "registered" persistent identifiers.

-

Entities and Internal Schema

-

Internally, identifiers would be lightweight pointers to "revisions" of an entity. Revisions are stored in their complete form, not as a patch or difference; if comparing to distributed version control systems, this is the git model, not the mercurial model.

-

The entity revisions are immutable once accepted; the editing process involves the creation of new entity revisions and, if the edit is approved, pointing the identifier to the new revision. Entities cross-reference between themselves by identifier not revision number. Identifier pointers also support (versioned) deletion and redirects (for merging entities).

-

Edit objects represent a change to a single entity; edits get batched together into edit groups (like "commits" and "pull requests" in git parlance).

-

SQL tables would probably look something like the following (but specific to each entity type, with tables like work_revision not entity_revision):

-
entity_ident
-    id (uuid)
-    current_revision (entity_revision foreign key)
-    redirect_id (optional; points to another entity_ident)
-
-entity_revision
-    revision_id
-    <entity-specific fields>
-    extra: json blob for schema evolution
-
-entity_edit
-    timestamp
-    editgroup_id
-    ident (entity_ident foreign key)
-    new_revision (entity_revision foreign key)
-    previous_revision (optional; points to entity_revision)
-    extra: json blob for progeny metadata
-
-editgroup
-    editor_id
-    description
-    extra: json blob for progeny metadata
-

Additional entity-specific columns would hold actual metadata. Additional tables (which would reference both entity_revision and entity_id foreign keys as appropriate) would represent things like authorship relationships (creator/release), citations between works, etc. Every revision of an entity would require duplicating all of these associated rows, which could end up being a large source of inefficiency, but is necessary to represent the full history of an object.

-

Scope

-

The goal is to capture the "scholarly web": the graph of written works that cite other works. Any work that is both cited more than once and cites more than one other work in the catalog is very likely to be in scope. "Leaf nodes" and small islands of intra-cited works may or may not be in scope.

-

Overall focus is on written works, with some exceptions. The expected core focus (for which we would pursue "completeness") is:

-
journal articles
-academic books
-conference proceedings
-technical memos
-dissertations
-monographs
-well-researched blog posts
-web pages (that have citations)
-"white papers"
-

Possibly in scope:

-
reports
-magazine articles
-essays
-notable mailing list postings
-government documents
-presentations (slides, video)
-datasets
-well-researched wiki pages
-patents
-

Probably not:

-
court cases and legal documents
-newspaper articles
-social media
-manuals
-datasheets
-courses
-published poetry
-

Definitely not:

-
audio recordings
-tv show episodes
-musical scores
-advertisements
-

Author, citation, and work disambiguation would be core tasks. Linking pre-prints to final publication is in scope.

-

I'm much less interested in altmetrics, funding, and grant relationships than most existing databases in this space.

-

fatcat would not include any fulltext content itself, even for cleanly licensed (open access) works, but would have "strong" (verified) links to fulltext content, and would include file-level metadata (like hashes and fingerprints) to help discovery and identify content from any source. File-level URLs with context ("repository", "author-homepage", "web-archive") should make fatcat more useful for both humans and machines to quickly access fulltext content of a given mimetype than existing redirect or landing page systems. So another factor in deciding scope is whether a work has "digital fixity" and can be contained in a single immutable file.

-

Ontology

-

Loosely following FRBR (Functional Requirements for Bibliographic Records), but removing the "manifestation" abstraction, and favoring files (digital artifacts) over physical items, the primary entities are:

-
work
-    <a stub, for grouping releases>
-
-release (aka "edition", "variant")
-    title
-    volume/pages/issue/chapter
-    media/formfactor
-    publication/peer-review status
-    language
-    <published> date
-    <variant-of> work
-    <published-in> container
-    <has-contributors> creator
-    <citation-to> release
-    <has> identifier
-
-file (aka "digital artifact")
-    <instantiates> release
-    hashes/checksums
-    mimetype
-    <found-at> URLs
-
-creator (aka "author")
-    name
-    identifiers
-    aliases
-
-container (aka "venue", "serial", "title")
-    name
-    open-access policy
-    peer-review policy
-    <has> aliases, acronyms
-    <about> subject/category
-    <has> identifier
-    <published-in> container
-    <published-by> publisher
-

Controlled Vocabularies

-

Some special namespace tables and enums would probably be helpful; these could live in the database (not requiring a database migration to update), but should have more controlled editing workflow... perhaps versioned in the codebase:

-
    -
  • identifier namespaces (DOI, ISBN, ISSN, ORCID, etc; but not the identifiers themselves)
  • -
  • subject categorization
  • -
  • license and open access status
  • -
  • work "types" (article vs. book chapter vs. proceeding, etc)
  • -
  • contributor types (author, translator, illustrator, etc)
  • -
  • human languages
  • -
  • file mimetypes
  • -
-

These could also be enforced by QA bots that review all editgroups.

-

Unresolved Questions

-

How to handle translations of, eg, titles and author names? To be clear, not translations of works (which are just separate releases), these are more like aliases or "originally known as".

-

Are bi-directional links a schema anti-pattern? Eg, should "work" point to a "primary release" (which itself points back to the work)?

-

Should identifier and citation be their own entities, referencing other entities by UUID instead of by revision? Not sure if this would increase or decrease database resource utilization.

-

Should contributor/author affiliation and contact information be retained? It could be very useful for disambiguation, but we don't want to build a huge database for spammers or "innovative" start-up marketing.

-

Can general-purpose SQL databases like Postgres or MySQL scale well enough to hold several tables with billions of entity revisions? Right from the start there are hundreds of millions of works and releases, many of which have dozens of citations, many authors, and many identifiers, and then we'll have potentially dozens of edits for each of these, which multiply out to 1e8 * 2e1 * 2e1 = 4e10, or 40 billion rows in the citation table. If each row was 32 bytes on average (uncompressed, not including index size), that would be 1.3 TByte on its own, larger than common SSD disks. I do think a transactional SQL datastore is the right answer. In my experience locking and index rebuild times are usually the biggest scaling challenges; the largely-immutable architecture here should mitigate locking. Hopefully few indexes would be needed in the primary database, as user interfaces could rely on secondary read-only search engines for more complex queries and views.

-

I see a tension between focus and scope creep. If a central database like fatcat doesn't support enough fields and metadata, then it will not be possible to completely import other corpuses, and this becomes "yet another" partial bibliographic database. On the other hand, accepting arbitrary data leads to other problems: sparseness increases (we have more "partial" data), potential for redundancy is high, humans will start editing content that might be bulk-replaced, etc.

-

There might be a need to support "stub" references between entities. Eg, when adding citations from PDF extraction, the cited works are likely to be ambiguous. Could create "stub" works to be merged/resolved later, or could leave the citation hanging. Same with authors, containers (journals), etc.

-

References and Previous Work

-

The closest overall analog of fatcat is MusicBrainz, a collaboratively edited music database. Open Library is a very similar existing service, which exclusively contains book metadata.

-

Wikidata seems to be the most successful and actively edited/developed open bibliographic database at this time (early 2018), including the wikicite conference and related Wikimedia/Wikipedia projects. Wikidata is a general purpose semantic database of entities, facts, and relationships; bibliographic metadata has become a large fraction of all content in recent years. The focus there seems to be linking knowledge (statements) to specific sources unambiguously. Potential advantages fatcat would have would be a focus on a specific scope (not a general-purpose database of entities) and a goal of completeness (capturing as many works and relationships as rapidly as possible). However, it might be better to just pitch in to the wikidata efforts.

-

The technical design of fatcat is loosely inspired by the git branch/tag/commit/tree architecture, and specifically inspired by Oliver Charles' "New Edit System" blog posts from 2012.

-

There are a whole bunch of proprietary, for-profit bibliographic databases, including Web of Science, Google Scholar, Microsoft Academic Graph, aminer, Scopus, and Dimensions. There are excellent field-limited databases like dblp, MEDLINE, and Semantic Scholar. There are some large general-purpose databases that are not directly user-editable, including the OpenCitation corpus, CORE, BASE, and CrossRef. I don't know of any large (more than 60 million works), open (bulk-downloadable with permissive or no license), field agnostic, user-editable corpus of scholarly publication bibliographic metadata.

-

RFC Changelog

-
    -
  • 2017-12-16: early notes
  • -
  • 2018-01-17: initial RFC document
  • -
  • 2018-08-10: updates from implementation work
  • -
- -{% endblock %} diff --git a/python/fatcat/templates/base.html b/python/fatcat/templates/base.html deleted file mode 100644 index 856a6e03..00000000 --- a/python/fatcat/templates/base.html +++ /dev/null @@ -1,78 +0,0 @@ - - - - - - - {% block title %}fatcat!{% endblock %} - - - - - - - - - - - - -
-{% block fullbody %} -
- {% block body %}Nothing to see here.{% endblock %} -
-{% endblock %} -
- - - - -{% block postscript %}{% endblock %} - - - diff --git a/python/fatcat/templates/changelog.html b/python/fatcat/templates/changelog.html deleted file mode 100644 index f33fe7c8..00000000 --- a/python/fatcat/templates/changelog.html +++ /dev/null @@ -1,25 +0,0 @@ -{% extends "base.html" %} -{% block body %} - -

Recent Changes -
changelog

- -Limited to the most recent ~50 entries. - - - - {% for entry in entries %} -
Changelog
Index -
Timestamp (UTC) - Editgroup - Editor - Description -
{{ entry.index }} - {{ entry.timestamp }} - {{ entry.editgroup_id }} - {{ entry.editgroup.editor_id }} - {% if entry.editgroup.description != None %}{{ entry.editgroup.description }}{% endif %} - {% endfor %} -
- -{% endblock %} diff --git a/python/fatcat/templates/changelog_view.html b/python/fatcat/templates/changelog_view.html deleted file mode 100644 index 22aff9bc..00000000 --- a/python/fatcat/templates/changelog_view.html +++ /dev/null @@ -1,13 +0,0 @@ -{% extends "editgroup_view.html" %} -{% block editgroupheader %} - -

Changelog Entry -
- changelog {{ entry.index }} -
-

- -
Timestamp: {{ entry.timestamp }} -
Editgroup: {{ editgroup.id }} - -{% endblock %} diff --git a/python/fatcat/templates/container_create.html b/python/fatcat/templates/container_create.html deleted file mode 100644 index 15288142..00000000 --- a/python/fatcat/templates/container_create.html +++ /dev/null @@ -1,168 +0,0 @@ -{% extends "base.html" %} -{% block body %} -
-

Adding a New Container

- -

A "container" is anything that groups publications together. For example, -a journal (eg, "New England Journal of Medicine"), conference proceedings, a -book series, or a blog. - -

Not all publications are in a container. - -

- -

The Basics

- -
- - -
- -
- - -
- - -
- - -
- - -
- - -
- - -
- - -
- - -
- - -
- - - - - - -

Anything Else?

- -
Create container
- -

Entity will be created as part of the current edit group, which needs to be -submitted and approved before the entity will formally be included in the -catalog. - -

- -
-{% endblock %} - -{% block postscript %} - -{% endblock %} diff --git a/python/fatcat/templates/container_view.html b/python/fatcat/templates/container_view.html deleted file mode 100644 index c2ca7327..00000000 --- a/python/fatcat/templates/container_view.html +++ /dev/null @@ -1,108 +0,0 @@ -{% extends "base.html" %} -{% block fullbody %} - -
-
-
-

{{ container.name }} -
container {{ container.ident }}

-
-
- -
-
-
- -

Publisher: -{% if container.publisher != None %}{{ container.publisher }}{% else %}Unknown{% endif %} -{% if container.coden != None %} -
CODEN?:  {{ container.coden }} -{% endif %} -{% if container.abbrev != None %} -
Abbrev.:  {{ container.abbrev }} -{% endif %} -{% if (container.extra != None) and (container.extra['url'] != None) and (container.extra['url']|length > 0) %} -
Homepage:  {{ container.extra['url'] }} -{% endif %} -{% if container.wikidata_qid != None %} -
Wikidata Entity:  {{ container.wikidata_qid }} -{% endif %} - -{% if container.extra != None %} -

Extra Metadata (raw JSON)

-{% for (key, value) in container.extra.items() %} -{{ key }}: {{ value }}
-{% endfor %} -{% endif %} - - - -
-
-
- -{% if container.extra.is_oa == True %} -Open Access Publisher -{% elif container.extra.is_oa == False %} -Not Open Access -{% else %} -Unknown OA Status -{% endif %} -
- -{% if container.issnl != None %} - ISSN-L? -  {{ container.issnl }} - {% if container.extra != None and (container.extra.ISSNp|length > 0) %} -
Print:  {{ container.extra.ISSNp }} - {% endif %} - {% if container.extra != None and (container.extra.ISSNe|length > 0) %} -
Electronic:  {{ container.extra.ISSNe }} - {% endif %} -
-{% endif %} - - -Directory Listings
-{% if (container.extra != None) %} - {% if container.extra.in_doaj == True %} - In DOAJ
- {% elif container.extra.in_doaj == False %} - Not in DOAJ
- {% endif %} - {% if container.extra.in_road == True %} - In ISSN ROAD
- {% elif container.extra.in_road == False %} - Not in ISSN ROAD
 - {% endif %} - {% if container.extra.is_kept == True %} - In Keepers Registry
- {% elif container.extra.is_kept == False %} - Not in Keepers Registry
- {% endif %} -{% endif %} -
- -Lookup Links -
SHERPA/RoMEO (access policies) -
wikidata.org -
- -Fatcat Bits -

State is "{{ container.state }}". Revision: -
{{ container.revision }} -
As JSON object via API - -

- - -
-
- -{% endblock %} diff --git a/python/fatcat/templates/creator_view.html b/python/fatcat/templates/creator_view.html deleted file mode 100644 index 2ce01fb6..00000000 --- a/python/fatcat/templates/creator_view.html +++ /dev/null @@ -1,82 +0,0 @@ -{% extends "base.html" %} -{% block fullbody %} - -
-
-
-

{{ creator.display_name }} -
creator {{ creator.ident }}

-
-
- -
-
-
- -

Given ("first") name: - {% if creator.given_name != None %}{{ creator.given_name}}{% else %}None or unknown{% endif %} -

Sur ("family"/"last") name: - {% if creator.surname != None %}{{ creator.surname }}{% else %}None or unknown{% endif %} - -{% if creator.wikidata_qid != None %} -
Wikidata Entity:  {{ creator.wikidata_qid }} -{% endif %} -{% if creator.extra != None %} -

Extra Metadata (raw JSON)

-{% for (key, value) in creator.extra.items() %} -{{ key }}: {{ value }}
-{% endfor %} -{% endif %} - -
-

Releases

-{% if releases != [] %} -

This creator has contributed to: -

    - {% for release in releases %} -
  • "{{ release.title }}", a {{ release.release_type }} published {{ release.release_date }} - {% if release.release_status != None %}(status: {{ release.release_status }}){% endif %}. -
    Fatcat ID: {{ release.ident }} - {% endfor %} -
-{% else %} -This creator has not contributed to any releases. -{% endif %} - - - -
-
-
- -{% if creator.orcid != None %} - ORCID?: -  {{ creator.orcid }} -
-{% endif %} - -Lookup Links -
wikidata.org -
VIAF -
dblp (CS) -
Google Scholar -
- -Fatcat Bits -

State is "{{ creator.state }}". Revision: -
{{ creator.revision }} -
As JSON object via API - -

- - -
-
- -{% endblock %} diff --git a/python/fatcat/templates/editgroup_view.html b/python/fatcat/templates/editgroup_view.html deleted file mode 100644 index ac3228b0..00000000 --- a/python/fatcat/templates/editgroup_view.html +++ /dev/null @@ -1,54 +0,0 @@ -{% extends "base.html" %} -{% block body %} - -{# extended by changelog_entry #} -{% block editgroupheader %} -

Edit Group -
editgroup {{ editgroup.id }}

-{% endblock %} - -{# TODO:

Editor: {{ editgroup.editor.username }} #} -
Editor: {{ editgroup.editor_id }} -
Description: {{ editgroup.description }} - -

Work Edits ({{ editgroup.edits.works|count }})

- - -

Release Edits ({{ editgroup.edits.releases|count }})

- - -

Container Edits ({{ editgroup.edits.containers|count }})

- - -

Creator Edits ({{ editgroup.edits.creators|count }})

- - -

File Edits ({{ editgroup.edits.files|count }})

- - -{% endblock %} diff --git a/python/fatcat/templates/editor_changelog.html b/python/fatcat/templates/editor_changelog.html deleted file mode 100644 index 79127312..00000000 --- a/python/fatcat/templates/editor_changelog.html +++ /dev/null @@ -1,29 +0,0 @@ -{% extends "base.html" %} -{% block body %} - -

Editor Changelog: {{ editor.username }} - -

- -

Changes accepted (aka, merged editgroups): - - - {% for entry in changelog_entries %} -
Changelog
Index -
Timestamp (UTC) - Editgroup - Editor - Description -
{{ entry.index }} - {{ entry.timestamp }} - {{ entry.editgroup_id }} - {{ entry.editgroup.editor_id }} - {% if entry.editgroup.description != None %}{{ entry.editgroup.description }}{% endif %} - {% endfor %} -
- -{% endblock %} diff --git a/python/fatcat/templates/editor_view.html b/python/fatcat/templates/editor_view.html deleted file mode 100644 index c9b61f5d..00000000 --- a/python/fatcat/templates/editor_view.html +++ /dev/null @@ -1,12 +0,0 @@ -{% extends "base.html" %} -{% block body %} - -

{{ editor.username }} -
- editor {{ editor.id }} -
-

- -

View editor's changelog - -{% endblock %} diff --git a/python/fatcat/templates/entity_edit.html b/python/fatcat/templates/entity_edit.html deleted file mode 100644 index 5da98d89..00000000 --- a/python/fatcat/templates/entity_edit.html +++ /dev/null @@ -1,8 +0,0 @@ -{% extends "base.html" %} -{% block body %} - -

Not Implemented

- -Entity editing isn't implemented yet, only creation. Sorry! - -{% endblock %} diff --git a/python/fatcat/templates/entity_history.html b/python/fatcat/templates/entity_history.html deleted file mode 100644 index 54577b2f..00000000 --- a/python/fatcat/templates/entity_history.html +++ /dev/null @@ -1,30 +0,0 @@ -{% extends "base.html" %} -{% block body %} - -

{% if page_title != None %}{{ page_title }}{% endif %} - -

- -

Fatcat Metadata Edit History

- - - - {% for entry in history %} -
Changelog
Index -
Timestamp (UTC) - Editgroup - Editor - Description -
{{ entry.changelog_entry.index }} - {{ entry.changelog_entry.timestamp }} - {{ entry.editgroup.id }} - {{ entry.editgroup.editor_id }} - {% if entry.editgroup.description != None %}{{ entry.editgroup.description }}{% endif %} - {% endfor %} -
- -{% endblock %} diff --git a/python/fatcat/templates/file_view.html b/python/fatcat/templates/file_view.html deleted file mode 100644 index 74977668..00000000 --- a/python/fatcat/templates/file_view.html +++ /dev/null @@ -1,108 +0,0 @@ -{% extends "base.html" %} -{% block fullbody %} - -
-
-
-

-
file {{ file.ident }}

-
-
- -
-
-
- -{% if file.extra != None %} -

Extra Metadata (raw JSON)

-{% for (key, value) in file.extra.items() %} -{{ key }}: {{ value }}
-{% endfor %} -{% endif %} - -

Releases

-{% if file.releases != None %} -

Releases associated with this file: -

-{% else %} -This file is not associated with any fatcat release. -{% endif %} - -

URLs

-{% if file.url != None %} -

Known locations of this file: -

-{% else %} -No known public URL, mirror, or archive for this file. -{% endif %} - -

Checksums

- - - - {% if file.sha1 != None %} -
Algorithm - Value -
SHA-1 - {{ file.sha1 }} - {% endif %} - {% if file.sha256 != None %} -
SHA-256 - {{ file.sha256 }} - {% endif %} - {% if file.md5!= None %} -
MD5 - {{ file.md5 }} - {% endif %} -
- - - -
-
- -{% if file.urls != None and file.urls != [] %} -Download File -{% else %} -No Download Available -{% endif %} - -
- -{% if file.size != None %} -

Size  {{ file.size }} (bytes) -

-{% endif %} - -{% if file.mimetype != None %} -

File Type  {{ file.mimetype }} -

-{% endif %} - -Fatcat Bits -

State is "{{ file.state }}". Revision: -
{{ file.revision }} -
As JSON object via API - -

- - -
-
- - -{% endblock %} diff --git a/python/fatcat/templates/home.html b/python/fatcat/templates/home.html deleted file mode 100644 index 4d3b44a1..00000000 --- a/python/fatcat/templates/home.html +++ /dev/null @@ -1,91 +0,0 @@ -{% extends "base.html" %} -{% block body %} -
- -

Welcome to fatcat!

- - - -
-
Current Status: Prototype
-
    -
  • No authentication or accounts -
  • Any edits will be lost -
  • Most creation/edit forms don't work -
  • Any data was bulk-imported, and may not be up to date -
  • Search results are from Crossref, not local API/database -
-
- -

This is a versioned, user-editable catalog of research publications: journal -articles, conference proceedings, pre-prints, etc. Features include archival -file-level metadata (verified digests and long-term copies, in addition to -URLs), a documented API, and work/release -indexing (aka, linking together of pre-prints and final copies). -Read more... - -

- - - - -
Entity - Actions - Examples - Lookup -
Release -
journal article, pre-print, book -
published version of a Work -
Create - Dummy -
Realistic -
-
- - -
-
-
Container -
journal or serial -
Create - Dummy -
Realistic -
-
- - -
-
- -
Creator -
authors, editors, translators -
- Dummy -
Realistic -
-
- - -
-
-
File -
specific digital blobs (immutable) -
- Dummy -
Realistic -
-
- - -
-
-
Work -
for grouping Releases -
- Dummy -
Realistic -
-
- -

-{% endblock %} diff --git a/python/fatcat/templates/release_changelog.html b/python/fatcat/templates/release_changelog.html deleted file mode 100644 index 706a5642..00000000 --- a/python/fatcat/templates/release_changelog.html +++ /dev/null @@ -1,17 +0,0 @@ -{% extends "base.html" %} -{% block body %} - -

Release Changelog: {{ release.id }}

- -

release: {{ release.id }} - -

Changelog: -

- -{% endblock %} diff --git a/python/fatcat/templates/release_create.html b/python/fatcat/templates/release_create.html deleted file mode 100644 index ac8a8169..00000000 --- a/python/fatcat/templates/release_create.html +++ /dev/null @@ -1,215 +0,0 @@ -{% extends "base.html" %} -{% block body %} -
-

Adding a New Thing

- -
- -

The Basics

- -
- - -
- -
- - -
- - - - - -
- -
- - -
-
- - -
- - -
- - -
- - -
- - -

Primary Release / Edition

- - - - - - - - - - - - - -

Anything Else?

- - - - -
Create Work
-
- -
-{% endblock %} - -{% block postscript %} - -{% endblock %} diff --git a/python/fatcat/templates/release_search.html b/python/fatcat/templates/release_search.html deleted file mode 100644 index c57ad149..00000000 --- a/python/fatcat/templates/release_search.html +++ /dev/null @@ -1,64 +0,0 @@ -{% extends "base.html" %} -{% block body %} - -

Article Search

-
-
-
- - -
-
- - -
-
-
- -
- -{% if found %} -{% if found.results %} - Showing top {{ found.count_returned }} out of {{ found.count_found }} results for: {{ found.query.q }} -{% for paper in found.results %} -
-

{{ paper['title'] }} - {% if paper.file_pdf_url %} -   fulltext - {% endif %} -

-
{{ ", ".join(paper.contrib_names) }}
- {% if paper.doi %} - DOI: {{ paper.doi }} - - {{ paper.release_type }} - {% if paper.release_date %} - - {{ paper.release_date[:4] }} - {% endif %} -
- {% endif %} - {% if paper.container_name %} - {% if paper.container_issnl %} - {{ paper.container_name }} - {% else %} - {{ paper.container_name }} - {% endif %} - {% if paper.container_is_oa %}{% endif %} - {% endif %} -
-{% endfor %} -{% else %} -
-

No results found!

- Query was: {{ found.query.q }} -
-

Try:

- -
-{% endif %} -{% endif %} - -{% endblock %} diff --git a/python/fatcat/templates/release_view.html b/python/fatcat/templates/release_view.html deleted file mode 100644 index 39dcf8fd..00000000 --- a/python/fatcat/templates/release_view.html +++ /dev/null @@ -1,290 +0,0 @@ -{% extends "base.html" %} -{% block fullbody %} - -
-
-
-

{{ release.title }} -
release {{ release.ident }}

-

- {% if authors != [] %} by {% endif %} - {% for contrib in authors %} - {% if contrib.creator_id %} - {{ contrib.raw_name }}{% if not loop.last %}, {% endif %} - {% else %} - {% if contrib.raw_name != None %}{{ contrib.raw_name }}{% else %}Unknown{% endif %}{% if not loop.last %}, {% endif %} - {% endif %} - {% endfor %} -

-
- -
-
-
- -{% if release.release_date != None %}

Date (published): {{ release.release_date }}{% endif %} -{% if release.pmid != None %} -
PubMed:  {{ release.pmid }} -{% endif %} -{% if release.pmcid != None %} -
PubMed Central:  {{ release.pmcid }} -{% endif %} -{% if release.wikidata_qid != None %} -
Wikidata Entity:  {{ release.wikidata_qid }} -{% endif %} -{% if release.language != None %} -
Primary Language:  {{ release.language }} (lookup ISO-639 code) -{% endif %} -
This {{ release.release_type or "unknown-type" }} is a release -(version) of the work  {{ -release.work_id }}. There may be other releases (eg, pre-prints, -formal publications, etc) linked to the same work. - -{% if container != None %} -

-
- - {% if release.release_status == 'published' %} - Published in {{ container.name }} - {% else %} - Released in {{ release.release_type }} in {{ container.name }} - {% endif %} - {% if release.publisher %} - by {{ release.publisher }} - {% endif %} -
- {% if container != None and container.issnl != None %}ISSN-L: {{ container.issnl }}
{% endif %} - {% if release.volume != None %}Volume: {{ release.volume }}
{% endif %} - {% if release.issue != None %}Issue: {{ release.issue }}
{% endif %} - {% if release.pages != None %}Page(s): {{ release.pages }}
{% endif %} - {% if release.publisher != None %}Publisher: {{ release.publisher }}
{% endif %} - {% if release.release_status != None %}Release Status: {{ release.release_status }}
{% endif %} - {% if release.release_type != None %}Release Type: {{ release.release_type}}
{% endif %} -
-
-{% endif %} - -{% if release.extra != None %} -

Extra Metadata (raw JSON)

-{% for (key, value) in release.extra.items() %} -{{ key }}: {% if key == "crossref" %} <truncated, see full JSON>{% else %} {{ value }} {% endif %}
-{% endfor %} -{% endif %} - - - -
-

Abstracts

-{% if release.abstracts != [] %} - {% for abstract in release.abstracts %} - Abstract ({{ abstract.sha1 }}, {{ abstract.mimetype }}): {{ abstract.content }} - {% endfor %} -{% else %} -

No known abstracts. -{% endif %} - -
-

All Contributors

-{% if release.contribs.size != 0 %} - - - - - {% for contrib in release.contribs %} - -
Attribution Order - Name - Role -
{% if contrib.index or contrib.index == 0 %} {{ contrib.index + 1 }}{% endif %} - {% if contrib.creator_id %} - {{ contrib.raw_name }} - {% else %} - {{ contrib.raw_name }} - {% endif %} - {{ contrib.role or '' }} - {% endfor %} -
-{% else %} -

Contributors (authors, translators, etc) not known. -{% endif %} - -
-

Known Files and URLs

-{% if files != [] %} - - - - - {% for file in files %} - -
SHA-1 - Size (bytes) - File Type - Links -
{{ file.sha1[:16] + "..." }} - {% if file.size != None %}{{ file.size }}{% endif %} - {% if file.mimetype != None %}{{ file.mimetype }}{% endif %} - {% for url in file.urls %} - {{ url.url.split('/')[2] }} ({{ url.rel }})
- {% endfor %} - {% endfor %} -
- -{% else %} -

There are no known files associated with this release (you could try -other releases for this work?). -{% endif %} - -
-{% if release.refs.size != 0 %} -

References

-This release citing other releases. -
    - {% for ref in release.refs %} -
  1. - {% if ref.title %} - {{ ref.title }} - {% if ref.container_name %}{{ ref.container_name }}.{% endif %} - {% if ref.year %}{{ ref.year }}{% endif %} - {% if ref.locator %}{{ ref.locator }}{% endif %} - {% elif ref.extra and ref.extra.crossref %} - {% if ref.extra.crossref.get('author') %}{{ ref.extra.crossref['author'] }}.{% endif %} - {% if ref.extra.crossref.get('article-title') %}{{ ref.extra.crossref['article-title'] }}.{% endif %} - {% if ref.container_name %}{{ ref.container_name }}.{% endif %} - {% if ref.year %}{{ ref.year }}.{% endif %} - {% elif ref.extra and ref.extra.unstructured %} - {{ ref.extra.unstructured }} - {% else %} - unknown - {% endif %} - {% if ref.target_release_id != None %} - (fatcat release) -{# {% elif ref.extra != None and ref.extra.doi != None %} - (DOI: {{ ref.extra.get('doi') }}) #} - {% endif %} - {% endfor %} -
-{% else %} -

No reference list available. -{% endif %} - -

-
- -{% if files != [] and files[0].urls != [] %} -Download Full Text -{% else %} -No Full Text Available -{% endif %} - -{% if release.release_type != None %} -
-Release Type {{ release.release_type }} -
-{% endif %} - -{% if release.doi %} - -{% endif %} -{% if release.isbn13 != None %} -
-

ISBN-13  {{ release.isbn13 }} -

-{% endif %} - -{% if release.extra.is_oa == True %} -
- Open Access -
-{% elif release.extra.is_oa == False %} - Not Open Access -
-{% endif %} - -{% if container != None %} -
-Container Metadata
-{% if container.extra.is_oa == True %} -Open Access Publication
-{% elif container.extra.is_oa == False %} -Not Open Access
-{% else %} -Unknown OA Status
-{% endif %} -{% if (container.extra != None) %} - {% if container.extra.in_doaj == True %} - In DOAJ
- {% elif container.extra.in_doaj == False %} - Not in DOAJ
- {% endif %} - {% if container.extra.in_road == True %} - In ISSN ROAD
- {% elif container.extra.in_road == False %} - Not in ISSN ROAD
- {% endif %} - {% if container.extra.is_kept == True %} - In Keepers Registry
- {% elif container.extra.is_kept == False %} - Not in Keepers Registry
- {% endif %} -{% endif %} -{% if container.issnl != None %} - ISSN-L:  {{ container.issnl }}
-{% endif %} - Fatcat:  {{ container.ident }}
-
-{% endif %} - -
-
Lookup Links
-
-{% if container != None and container.issnl != None %} - SHERPA/RoMEO (journal policies)
-{% endif %} -{% if container != None and container.doi != None %} - oaDOI/unpaywall
-{% endif %} -{% if release.isbn13 != None %} - Open Library
- Worldcat
-{% else %} - Worldcat
-{% endif %} -{% if release.doi %} -Crossref Metadata (via API)
-{% endif %} -wikidata.org
-CORE.ac.uk
-Semantic Scholar (CS, neuro)
-Google Scholar
-
-
- -
-Fatcat Bits -

State is "{{ release.state }}". Revision: -
{{ release.revision }} -
As JSON object via API - -

- - -
-
-{% endblock %} - -{% block postscript %} - -{% endblock %} diff --git a/python/fatcat/templates/stats.html b/python/fatcat/templates/stats.html deleted file mode 100644 index 6a37dcee..00000000 --- a/python/fatcat/templates/stats.html +++ /dev/null @@ -1,104 +0,0 @@ -{% extends "base.html" %} -{% block body %} - -

Entity Statistics

- -
-
- {{ stats.entity_counts.work }} -
-
- Works -
-
- -
- -
-
- {{ stats.entity_counts.release }} -
-
- Releases -
-
- -
-
- {{ stats.releases_with_dois }} -
-
- ... with DOIs -
-
- -
-
- {{ stats.releases_with_dois }} -
-
- ... with a File -
-
- -
- -
-
- {{ stats.entity_counts.container }} -
-
- Containers -
-
- -
-
- {{ stats.containers_with_issnls }} -
-
- ... with an ISSN-L -
-
- -
- -
-
- {{ stats.entity_counts.creator }} -
-
- Creators -
-
- -
-
- {{ stats.creators_with_orcids }} -
-
- ... with an ORCID -
-
- -
- -
-
- {{ stats.entity_counts.file }} -
-
- Files -
-
- -
-
- {{ stats.files_with_releases }} -
-
- ... with a Release -
-
- -{% endblock %} diff --git a/python/fatcat/templates/work_view.html b/python/fatcat/templates/work_view.html deleted file mode 100644 index 87120e63..00000000 --- a/python/fatcat/templates/work_view.html +++ /dev/null @@ -1,72 +0,0 @@ -{% extends "base.html" %} -{% block fullbody %} - -
-
-
-

-
work {{ work.ident }}

-
-
- -
-
-
- -{% if work.extra != None %} -

Extra Metadata (raw JSON)

-{% for (key, value) in work.extra.items() %} -{{ key }}: {{ value }}
-{% endfor %} -{% endif %} - - - -

A "work" is just a linking identifier between a set of releases. For -example, a pre-print and a published article may contain small differences, but -still reference the same underlying "work". - -
- -{% if releases != [] %} -

    - {% for release in releases %} -
  • "{{ release.title }}", a {{ release.release_type }} published {{ release.release_date }} as {{ release.release_status }}. -
    {{ release.ident }} - {% endfor %} -
-{% else %} -

There are no known releases associated with this work. -{% endif %} - - -

-
-
- -Work Type: -{% if work.work_type != None %} - {{ work.work_type }} -{% else %} -unknown -{% endif %} -
- -Fatcat Bits -

State is "{{ work.state }}". Revision: -
{{ work.revision }} -
As JSON object via API - -

- - -
-
- -{% endblock %} diff --git a/python/fatcat/worker_common.py b/python/fatcat/worker_common.py deleted file mode 100644 index 77ea2c15..00000000 --- a/python/fatcat/worker_common.py +++ /dev/null @@ -1,25 +0,0 @@ - -import re -import sys -import csv -import json -import itertools -import fatcat_client -from pykafka import KafkaClient -from fatcat_client.rest import ApiException - - -class FatcatWorker: - """ - Common code for for Kafka producers and consumers. - """ - - def __init__(self, kafka_hosts, produce_topic=None, consume_topic=None, api_host_url=None): - if api_host_url: - conf = fatcat_client.Configuration() - conf.host = api_host_url - self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) - self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0") - self.produce_topic = produce_topic - self.consume_topic = consume_topic - diff --git a/python/fatcat_tools/changelog_workers.py b/python/fatcat_tools/changelog_workers.py new file mode 100644 index 00000000..e341ea32 --- /dev/null +++ b/python/fatcat_tools/changelog_workers.py @@ -0,0 +1,122 @@ + +import json +import time +from itertools import islice +from fatcat.worker_common import FatcatWorker +from pykafka.common import OffsetType + + +class FatcatChangelogWorker(FatcatWorker): + """ + Periodically polls the fatcat API looking for new changelogs. When they are + found, fetch them and push (as JSON) into a Kafka topic. + """ + + def __init__(self, api_host_url, kafka_hosts, produce_topic, poll_interval=10.0, offset=None): + # TODO: should be offset=0 + super().__init__(kafka_hosts=kafka_hosts, + produce_topic=produce_topic, + api_host_url=api_host_url) + self.poll_interval = poll_interval + self.offset = offset # the fatcat changelog offset, not the kafka offset + + def most_recent_message(self, topic): + """ + Tries to fetch the most recent message from a given topic. + This only makes sense for single partition topics, though could be + extended with "last N" behavior. 
+ + Following "Consuming the last N messages from a topic" + from https://pykafka.readthedocs.io/en/latest/usage.html#consumer-patterns + """ + consumer = topic.get_simple_consumer( + auto_offset_reset=OffsetType.LATEST, + reset_offset_on_start=True) + offsets = [(p, op.last_offset_consumed - 1) + for p, op in consumer._partitions.items()] + offsets = [(p, (o if o > -1 else -2)) for p, o in offsets] + if -2 in [o for p, o in offsets]: + return None + else: + consumer.reset_offsets(offsets) + msg = islice(consumer, 1) + if msg: + return list(msg)[0].value + else: + return None + + def run(self): + topic = self.kafka.topics[self.produce_topic] + # On start, try to consume the most recent from the topic, and using + # that as the starting offset. Note that this is a single-partition + # topic + if self.offset is None: + print("Checking for most recent changelog offset...") + msg = self.most_recent_message(topic) + if msg: + self.offset = json.loads(msg.decode('utf-8'))['index'] + else: + self.offset = 1 + + with topic.get_sync_producer() as producer: + while True: + latest = int(self.api.get_changelog(limit=1)[0].index) + if latest > self.offset: + print("Fetching changelogs from {} through {}".format( + self.offset+1, latest)) + for i in range(self.offset+1, latest+1): + cle = self.api.get_changelog_entry(i) + obj = self.api.api_client.sanitize_for_serialization(cle) + producer.produce( + message=json.dumps(obj).encode('utf-8'), + partition_key=None, + timestamp=None, + #XXX: timestamp=cle.timestamp, + ) + self.offset = i + print("Sleeping {} seconds...".format(self.poll_interval)) + time.sleep(self.poll_interval) + + +class FatcatEntityUpdatesWorker(FatcatWorker): + """ + Consumes from the changelog topic and publishes expanded entities (fetched + from API) to update topics. + + For now, only release updates are published. 
+ """ + + def __init__(self, api_host_url, kafka_hosts, consume_topic, release_topic): + super().__init__(kafka_hosts=kafka_hosts, + consume_topic=consume_topic, + api_host_url=api_host_url) + self.release_topic = release_topic + self.consumer_group = "entity-updates" + + def run(self): + changelog_topic = self.kafka.topics[self.consume_topic] + release_topic = self.kafka.topics[self.release_topic] + + consumer = changelog_topic.get_balanced_consumer( + consumer_group=self.consumer_group, + managed=True, + auto_offset_reset=OffsetType.LATEST, + reset_offset_on_start=False, + ) + + with release_topic.get_sync_producer() as producer: + for msg in consumer: + cle = json.loads(msg.value.decode('utf-8')) + #print(cle) + release_edits = cle['editgroup']['edits']['releases'] + for re in release_edits: + ident = re['ident'] + release = self.api.get_release(ident, expand="files,container") + release_dict = self.api.api_client.sanitize_for_serialization(release) + producer.produce( + message=json.dumps(release_dict).encode('utf-8'), + partition_key=ident.encode('utf-8'), + timestamp=None, + ) + consumer.commit_offsets() + diff --git a/python/fatcat_tools/crossref_importer.py b/python/fatcat_tools/crossref_importer.py new file mode 100644 index 00000000..37005965 --- /dev/null +++ b/python/fatcat_tools/crossref_importer.py @@ -0,0 +1,272 @@ + +import sys +import json +import sqlite3 +import datetime +import itertools +import fatcat_client +from fatcat.importer_common import FatcatImporter + + +class FatcatCrossrefImporter(FatcatImporter): + + def __init__(self, host_url, issn_map_file, extid_map_file=None, create_containers=True): + super().__init__(host_url, issn_map_file) + self.extid_map_db = None + if extid_map_file: + db_uri = "file:{}?mode=ro".format(extid_map_file) + print("Using external ID map: {}".format(db_uri)) + self.extid_map_db = sqlite3.connect(db_uri, uri=True) + else: + print("Not using external ID map") + self.create_containers = create_containers + + def 
lookup_ext_ids(self, doi): + if self.extid_map_db is None: + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) + row = self.extid_map_db.execute("SELECT core, pmid, pmcid, wikidata FROM ids WHERE doi=? LIMIT 1", + [doi.lower()]).fetchone() + if row is None: + return dict(core_id=None, pmid=None, pmcid=None, wikidata_qid=None) + row = [str(cell or '') or None for cell in row] + return dict( + core_id=row[0], + pmid=row[1], + pmcid=row[2], + wikidata_qid=row[3]) + + def parse_crossref_dict(self, obj): + """ + obj is a python dict (parsed from json). + returns a ReleaseEntity + """ + + # This work is out of scope if it doesn't have authors and a title + if (not 'author' in obj) or (not 'title' in obj): + return None + + # Other ways to be out of scope (provisionally) + if (not 'type' in obj): + return None + + # contribs + def do_contribs(obj_list, ctype): + contribs = [] + for i, am in enumerate(obj_list): + creator_id = None + if 'ORCID' in am.keys(): + creator_id = self.lookup_orcid(am['ORCID'].split('/')[-1]) + # Sorry humans :( + if am.get('given') and am.get('family'): + raw_name = "{} {}".format(am['given'], am['family']) + elif am.get('family'): + raw_name = am['family'] + else: + # TODO: defaults back to a pseudo-null value + raw_name = am.get('given', '') + extra = dict() + if ctype == "author": + index = i + else: + index = None + if am.get('affiliation'): + # note: affiliation => affiliations + extra['affiliations'] = am.get('affiliation') + if am.get('sequence') and am.get('sequence') != "additional": + extra['sequence'] = am.get('sequence') + if not extra: + extra = None + contribs.append(fatcat_client.ReleaseContrib( + creator_id=creator_id, + index=index, + raw_name=raw_name, + role=ctype, + extra=extra)) + return contribs + contribs = do_contribs(obj['author'], "author") + contribs.extend(do_contribs(obj.get('editor', []), "editor")) + contribs.extend(do_contribs(obj.get('translator', []), "translator")) + + # container + issn = 
obj.get('ISSN', [None])[0] + issnl = self.issn2issnl(issn) + container_id = None + if issnl: + container_id = self.lookup_issnl(issnl) + publisher = obj.get('publisher') + + ce = None + if (container_id is None and self.create_containers and issnl != None + and obj.get('container-title') and len(obj['container-title']) > 0): + ce = fatcat_client.ContainerEntity( + issnl=issnl, + publisher=publisher, + name=obj['container-title'][0]) + + # references + refs = [] + for i, rm in enumerate(obj.get('reference', [])): + try: + year = int(rm.get('year')) + # NOTE: will need to update/config in the future! + # NOTE: are there crossref works with year < 100? + if year > 2025 or year < 100: + year = None + except: + year = None + extra = rm.copy() + if rm.get('DOI'): + extra['doi'] = rm.get('DOI').lower() + key = rm.get('key') + if key and key.startswith(obj['DOI'].upper()): + key = key.replace(obj['DOI'].upper() + "-", '') + key = key.replace(obj['DOI'].upper(), '') + container_name = rm.get('volume-title') + if not container_name: + container_name = rm.get('journal-title') + extra.pop('DOI', None) + extra.pop('key', None) + extra.pop('year', None) + extra.pop('volume-name', None) + extra.pop('journal-title', None) + extra.pop('title', None) + extra.pop('first-page', None) + extra.pop('doi-asserted-by', None) + if extra: + extra = dict(crossref=extra) + else: + extra = None + refs.append(fatcat_client.ReleaseRef( + index=i, + # doing lookups would be a second import pass + target_release_id=None, + key=key, + year=year, + container_name=container_name, + title=rm.get('title'), + locator=rm.get('first-page'), + # TODO: just dump JSON somewhere here? 
+ extra=extra)) + + # abstracts + abstracts = [] + if obj.get('abstract') != None: + abstracts.append(fatcat_client.ReleaseEntityAbstracts( + mimetype="application/xml+jats", + content=obj.get('abstract'))) + + # extra fields + extra = dict() + for key in ('subject', 'type', 'license', 'alternative-id', + 'container-title', 'original-title', 'subtitle', 'archive', + 'funder', 'group-title'): + # TODO: unpack "container-title" array + val = obj.get(key) + if val: + extra[key] = val + if 'license' in extra and extra['license']: + for i in range(len(extra['license'])): + if 'start' in extra['license'][i]: + extra['license'][i]['start'] = extra['license'][i]['start']['date-time'] + if len(obj['title']) > 1: + extra['other-titles'] = obj['title'][1:] + # TODO: this should be top-level + extra['is_kept'] = len(obj.get('archive', [])) > 0 + + # ISBN + isbn13 = None + for raw in obj.get('ISBN', []): + # TODO: convert if not ISBN-13 format + if len(raw) == 17: + isbn13 = raw + break + + # release status + if obj['type'] in ('journal-article', 'conference-proceeding', 'book', + 'dissertation', 'book-chapter'): + release_status = "published" + else: + # unknown + release_status = None + + # external identifiers + extids = self.lookup_ext_ids(doi=obj['DOI'].lower()) + + # TODO: filter out huge releases; we'll get them later (and fix bug in + # fatcatd) + if max(len(contribs), len(refs), len(abstracts)) > 750: + return None + + # release date parsing is amazingly complex + release_date = obj['issued']['date-parts'][0] + if not release_date or not release_date[0]: + # got some NoneType, even though at least year is supposed to be set + release_date = None + elif len(release_date) == 3: + release_date = datetime.datetime(year=release_date[0], month=release_date[1], day=release_date[2]) + else: + # only the year is actually required; mangle to first day for date + # (TODO: something better?) 
+ release_date = datetime.datetime(year=release_date[0], month=1, day=1) + # convert to string ISO datetime format (if not null) + if release_date: + release_date = release_date.isoformat() + "Z" + + re = fatcat_client.ReleaseEntity( + work_id=None, + title=obj['title'][0], + contribs=contribs, + refs=refs, + container_id=container_id, + publisher=publisher, + release_type=obj['type'], + release_status=release_status, + doi=obj['DOI'].lower(), + isbn13=isbn13, + core_id=extids['core_id'], + pmid=extids['pmid'], + pmcid=extids['pmcid'], + wikidata_qid=extids['wikidata_qid'], + release_date=release_date, + issue=obj.get('issue'), + volume=obj.get('volume'), + pages=obj.get('page'), + abstracts=abstracts, + extra=dict(crossref=extra)) + return (re, ce) + + def create_row(self, row, editgroup=None): + if row is None: + return + obj = json.loads(row) + entities = self.parse_crossref_dict(obj) + if entities is not None: + (re, ce) = entities + if ce is not None: + container = self.api.create_container(ce, editgroup=editgroup) + re.container_id = container.ident + self._issnl_id_map[ce.issnl] = container.ident + self.api.create_release(re, editgroup=editgroup) + self.insert_count = self.insert_count + 1 + + def create_batch(self, batch, editgroup=None): + """Current work/release pairing disallows batch creation of releases. 
+ Could do batch work creation and then match against releases, but meh.""" + release_batch = [] + for row in batch: + if row is None: + continue + obj = json.loads(row) + entities = self.parse_crossref_dict(obj) + if entities is not None: + (re, ce) = entities + if ce is not None: + ce_eg = self.api.create_editgroup( + fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + container = self.api.create_container(ce, editgroup=ce_eg.id) + self.api.accept_editgroup(ce_eg.id) + re.container_id = container.ident + self._issnl_id_map[ce.issnl] = container.ident + release_batch.append(re) + self.api.create_release_batch(release_batch, autoaccept="true", editgroup=editgroup) + self.insert_count = self.insert_count + len(release_batch) diff --git a/python/fatcat_tools/elastic_workers.py b/python/fatcat_tools/elastic_workers.py new file mode 100644 index 00000000..3d2e9c39 --- /dev/null +++ b/python/fatcat_tools/elastic_workers.py @@ -0,0 +1,47 @@ + +import json +import time +import requests +from fatcat.worker_common import FatcatWorker +from fatcat_client.models import ReleaseEntity +from fatcat.entity_helpers import * +from pykafka.common import OffsetType + + +class FatcatElasticReleaseWorker(FatcatWorker): + """ + Consumes from release-updates topic and pushes into (presumably local) + elasticsearch. + + Uses a consumer group to manage offset. 
+ """ + + def __init__(self, kafka_hosts, consume_topic, poll_interval=10.0, offset=None, + elastic_backend="http://localhost:9200", elastic_index="fatcat"): + super().__init__(kafka_hosts=kafka_hosts, + consume_topic=consume_topic, + api_host_url=None) + self.consumer_group = "elastic-updates" + self.elastic_backend = elastic_backend + self.elastic_index = elastic_index + + def run(self): + consume_topic = self.kafka.topics[self.consume_topic] + + consumer = consume_topic.get_balanced_consumer( + consumer_group=self.consumer_group, + managed=True, + ) + + for msg in consumer: + json_str = msg.value.decode('utf-8') + release = entity_from_json(json_str, ReleaseEntity) + #print(release) + elastic_endpoint = "{}/{}/release/{}".format( + self.elastic_backend, + self.elastic_index, + release.ident) + print("Updating document: {}".format(elastic_endpoint)) + resp = requests.post(elastic_endpoint, json=release.to_elastic_dict()) + assert resp.status_code in (200, 201) + consumer.commit_offsets() diff --git a/python/fatcat_tools/entity_helpers.py b/python/fatcat_tools/entity_helpers.py new file mode 100644 index 00000000..c454536b --- /dev/null +++ b/python/fatcat_tools/entity_helpers.py @@ -0,0 +1,100 @@ + +import collections +from fatcat_client.models import ReleaseEntity +from fatcat_client.api_client import ApiClient + +def entity_to_json(entity): + ac = ApiClient() + return ac.sanitize_for_serialization(entity) + +def entity_from_json(json_str, entity_type): + """ + Hack to take advantage of the code-generated deserialization code + """ + ac = ApiClient() + thing = collections.namedtuple('Thing', ['data']) + thing.data = json_str + return ac.deserialize(thing, entity_type) + +def release_elastic_dict(release): + """ + Converts from an entity model/schema to elasticsearch oriented schema. 
+ + Returns: dict + """ + + if release.state != 'active': + raise ValueError("Entity is not 'active'") + + # First, the easy ones (direct copy) + t = dict( + ident = release.ident, + revision = release.revision, + title = release.title, + release_type = release.release_type, + release_status = release.release_status, + language = release.language, + doi = release.doi, + pmid = release.pmid, + pmcid = release.pmcid, + isbn13 = release.isbn13, + core_id = release.core_id, + wikidata_qid = release.wikidata_qid + ) + + if release.release_date: + # TODO: resolve why this can be either a string or datetime + if type(release.release_date) == str: + t['release_date'] = release.release_date + else: + t['release_date'] = release.release_date.strftime('%F') + + container = release.container + container_is_kept = False + if container: + t['publisher'] = container.publisher + t['container_name'] = container.name + t['container_issnl'] = container.issnl + container_extra = container.extra + if container_extra: + t['container_is_oa'] = container_extra.get('is_oa') + container_is_kept = container_extra.get('is_kept', False) + t['container_is_longtail_oa'] = container_extra.get('is_longtail_oa') + else: + t['publisher'] = release.publisher + + files = release.files or [] + t['file_count'] = len(files) + in_wa = False + in_ia = False + t['file_pdf_url'] = None + for f in files: + is_pdf = 'pdf' in f.get('mimetype', '') + for url in f.get('urls', []): + if url.get('rel', '') == 'webarchive': + in_wa = True + if '//web.archive.org/' in url['url'] or '//archive.org/' in url['url']: + in_ia = True + if is_pdf: + t['file_pdf_url'] = url['url'] + if not t['file_pdf_url'] and is_pdf: + t['file_pdf_url'] = url['url'] + t['file_in_webarchive'] = in_wa + t['file_in_ia'] = in_ia + + extra = release.extra or dict() + if extra: + t['in_shadow'] = extra.get('in_shadow') + if extra.get('grobid') and extra['grobid'].get('is_longtail_oa'): + t['container_is_longtail_oa'] = True + t['any_abstract'] = 
bool(release.abstracts) + t['is_kept'] = container_is_kept or extra.get('is_kept', False) + + t['ref_count'] = len(release.refs or []) + t['contrib_count'] = len(release.contribs or []) + contrib_names = [] + for c in (release.contribs or []): + if c.raw_name: + contrib_names.append(c.raw_name) + t['contrib_names'] = contrib_names + return t diff --git a/python/fatcat_tools/fcid.py b/python/fatcat_tools/fcid.py new file mode 100644 index 00000000..dd72b242 --- /dev/null +++ b/python/fatcat_tools/fcid.py @@ -0,0 +1,17 @@ + +import base64 +import uuid + +def fcid2uuid(s): + s = s.split('_')[-1].upper().encode('utf-8') + assert len(s) == 26 + raw = base64.b32decode(s + b"======") + return str(uuid.UUID(bytes=raw)).lower() + +def uuid2fcid(s): + raw = uuid.UUID(s).bytes + return base64.b32encode(raw)[:26].lower().decode('utf-8') + +def test_fcid(): + test_uuid = '00000000-0000-0000-3333-000000000001' + assert test_uuid == fcid2uuid(uuid2fcid(test_uuid)) diff --git a/python/fatcat_tools/grobid_metadata_importer.py b/python/fatcat_tools/grobid_metadata_importer.py new file mode 100755 index 00000000..95cc285e --- /dev/null +++ b/python/fatcat_tools/grobid_metadata_importer.py @@ -0,0 +1,168 @@ +#!/usr/bin/env python3 + +import sys +import json +import base64 +import datetime +import fatcat_client +from fatcat.importer_common import FatcatImporter + +MAX_ABSTRACT_BYTES=4096 + + +class FatcatGrobidMetadataImporter(FatcatImporter): + + def __init__(self, host_url, default_link_rel="web"): + super().__init__(host_url) + self.default_link_rel = default_link_rel + + def parse_grobid_json(self, obj): + + if not obj.get('title'): + return None + + release = dict() + extra = dict() + + if obj.get('abstract') and len(obj.get('abstract')) < MAX_ABSTRACT_BYTES: + abobj = dict( + mimetype="text/plain", + language=None, + content=obj.get('abstract').strip()) + abstracts = [abobj] + else: + abstracts = None + + contribs = [] + for i, a in enumerate(obj.get('authors', [])): + c = 
dict(raw_name=a['name'], role="author") + contribs.append(fatcat_client.ReleaseContrib( + index=i, + raw_name=a['name'], + role="author", + extra=None)) + + refs = [] + for raw in obj.get('citations', []): + cite_extra = dict() + ref = dict() + ref['key'] = raw.get('id') + if raw.get('title'): + ref['title'] = raw['title'].strip() + if raw.get('date'): + try: + year = int(raw['date'].strip()[:4]) + ref['year'] = year + except: + pass + for key in ('volume', 'url', 'issue', 'publisher'): + if raw.get(key): + cite_extra[key] = raw[key].strip() + if raw.get('authors'): + cite_extra['authors'] = [a['name'] for a in raw['authors']] + if cite_extra: + cite_extra = dict(grobid=cite_extra) + else: + cite_extra = None + ref['extra'] = cite_extra + refs.append(ref) + + release_type = "journal-article" + release_date = None + if obj.get('date'): + # TODO: only returns year, ever? how to handle? + release_date = datetime.datetime(year=int(obj['date'][:4]), month=1, day=1) + + if obj.get('doi'): + extra['doi'] = obj['doi'] + if obj['journal'] and obj['journal'].get('name'): + extra['container_name'] = obj['journal']['name'] + + extra['is_longtail_oa'] = True + + # TODO: ISSN/eISSN handling? or just journal name lookup? + + if extra: + extra = dict(grobid=extra) + else: + extra = None + + re = fatcat_client.ReleaseEntity( + title=obj['title'].strip(), + contribs=contribs, + refs=refs, + publisher=obj['journal'].get('publisher'), + volume=obj['journal'].get('volume'), + issue=obj['journal'].get('issue'), + abstracts=abstracts, + extra=extra) + return re + + # TODO: make this a common function somewhere + def make_url(self, raw): + rel = self.default_link_rel + # TODO: this is where we could map specific domains to rel types, + # and also filter out bad domains, invalid URLs, etc + if "//archive.org/" in raw or "//arxiv.org/" in raw: + # TODO: special-case the arxiv.org bulk mirror? 
+ rel = "repository" + elif "//web.archive.org/" in raw or "//archive.is/" in raw: + rel = "webarchive" + return fatcat_client.FileEntityUrls(url=raw, rel=rel) + + def parse_file_metadata(self, sha1_key, cdx, mimetype, file_size): + + sha1 = base64.b16encode(base64.b32decode(sha1_key.replace('sha1:', ''))).decode('ascii').lower() + + # lookup existing SHA1, or create new entity + try: + existing_file = self.api.lookup_file(sha1=sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + existing_file = None + + if existing_file: + # if file is already in here, presumably not actually long-tail + return None + fe = fatcat_client.FileEntity( + sha1=sha1, + size=int(file_size), + mimetype=mimetype, + releases=[], + urls=[], + ) + + # parse URLs and CDX + original = cdx['url'] + wayback = "https://web.archive.org/web/{}/{}".format( + cdx['dt'], + original) + fe.urls.append( + fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) + original_url = self.make_url(original) + if original_url != None: + fe.urls.append(original_url) + + return fe + + def create_row(self, row, editgroup=None): + if not row: + return + fields = row.split('\t') + sha1_key = fields[0] + cdx = json.loads(fields[1]) + mimetype = fields[2] + file_size = int(fields[3]) + grobid_meta = json.loads(fields[4]) + fe = self.parse_file_metadata(sha1_key, cdx, mimetype, file_size) + re = self.parse_grobid_json(grobid_meta) + if fe and re: + release_entity = self.api.create_release(re, editgroup=editgroup) + # release ident can't already be in release list because we just + # created it + fe.releases.append(release_entity.ident) + file_entity = self.api.create_file(fe, editgroup=editgroup) + self.insert_count = self.insert_count + 1 + + # NB: batch mode not implemented diff --git a/python/fatcat_tools/importer_common.py b/python/fatcat_tools/importer_common.py new file mode 100644 index 00000000..8dfee875 --- /dev/null +++ b/python/fatcat_tools/importer_common.py @@ 
-0,0 +1,137 @@ + +import re +import sys +import csv +import json +import itertools +import fatcat_client +from fatcat_client.rest import ApiException + +# from: https://docs.python.org/3/library/itertools.html +def grouper(iterable, n, fillvalue=None): + "Collect data into fixed-length chunks or blocks" + args = [iter(iterable)] * n + return itertools.zip_longest(*args, fillvalue=fillvalue) + +class FatcatImporter: + + def __init__(self, host_url, issn_map_file=None): + conf = fatcat_client.Configuration() + conf.host = host_url + self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) + self._issnl_id_map = dict() + self._orcid_id_map = dict() + self._doi_id_map = dict() + self._issn_issnl_map = None + self._orcid_regex = re.compile("^\\d{4}-\\d{4}-\\d{4}-\\d{3}[\\dX]$") + if issn_map_file: + self.read_issn_map_file(issn_map_file) + self.processed_lines = 0 + self.insert_count = 0 + self.update_count = 0 + + def describe_run(self): + print("Processed {} lines, inserted {}, updated {}.".format( + self.processed_lines, self.insert_count, self.update_count)) + + def process_source(self, source, group_size=100): + """Creates and auto-accepts editgroup every group_size rows""" + eg = self.api.create_editgroup( + fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + for i, row in enumerate(source): + self.create_row(row, editgroup=eg.id) + if i > 0 and (i % group_size) == 0: + self.api.accept_editgroup(eg.id) + eg = self.api.create_editgroup( + fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + self.processed_lines = self.processed_lines + 1 + if i == 0 or (i % group_size) != 0: + self.api.accept_editgroup(eg.id) + + def process_batch(self, source, size=50): + """Reads and processes in batches (not API-call-per-)""" + for rows in grouper(source, size): + self.processed_lines = self.processed_lines + len(rows) + eg = self.api.create_editgroup( + fatcat_client.Editgroup(editor_id='aaaaaaaaaaaabkvkaaaaaaaaae')) + 
self.create_batch(rows, editgroup=eg.id) + + def process_csv_source(self, source, group_size=100, delimiter=','): + reader = csv.DictReader(source, delimiter=delimiter) + self.process_source(reader, group_size) + + def process_csv_batch(self, source, size=50, delimiter=','): + reader = csv.DictReader(source, delimiter=delimiter) + self.process_batch(reader, size) + + def is_issnl(self, issnl): + return len(issnl) == 9 and issnl[4] == '-' + + def lookup_issnl(self, issnl): + """Caches calls to the ISSN-L lookup API endpoint in a local dict""" + if issnl in self._issnl_id_map: + return self._issnl_id_map[issnl] + container_id = None + try: + rv = self.api.lookup_container(issnl=issnl) + container_id = rv.ident + except ApiException as ae: + # If anything other than a 404 (not found), something is wrong + assert ae.status == 404 + self._issnl_id_map[issnl] = container_id # might be None + return container_id + + def is_orcid(self, orcid): + return self._orcid_regex.match(orcid) != None + + def lookup_orcid(self, orcid): + """Caches calls to the Orcid lookup API endpoint in a local dict""" + if not self.is_orcid(orcid): + return None + if orcid in self._orcid_id_map: + return self._orcid_id_map[orcid] + creator_id = None + try: + rv = self.api.lookup_creator(orcid=orcid) + creator_id = rv.ident + except ApiException as ae: + # If anything other than a 404 (not found), something is wrong + assert ae.status == 404 + self._orcid_id_map[orcid] = creator_id # might be None + return creator_id + + def is_doi(self, doi): + return doi.startswith("10.") and doi.count("/") >= 1 + + def lookup_doi(self, doi): + """Caches calls to the doi lookup API endpoint in a local dict""" + assert self.is_doi(doi) + doi = doi.lower() + if doi in self._doi_id_map: + return self._doi_id_map[doi] + release_id = None + try: + rv = self.api.lookup_release(doi=doi) + release_id = rv.ident + except ApiException as ae: + # If anything other than a 404 (not found), something is wrong + assert 
ae.status == 404 + self._doi_id_map[doi] = release_id # might be None + return release_id + + def read_issn_map_file(self, issn_map_file): + print("Loading ISSN map file...") + self._issn_issnl_map = dict() + for line in issn_map_file: + if line.startswith("ISSN") or len(line) == 0: + continue + (issn, issnl) = line.split()[0:2] + self._issn_issnl_map[issn] = issnl + # double mapping makes lookups easy + self._issn_issnl_map[issnl] = issnl + print("Got {} ISSN-L mappings.".format(len(self._issn_issnl_map))) + + def issn2issnl(self, issn): + if issn is None: + return None + return self._issn_issnl_map.get(issn) diff --git a/python/fatcat_tools/issn_importer.py b/python/fatcat_tools/issn_importer.py new file mode 100644 index 00000000..c9ef50b5 --- /dev/null +++ b/python/fatcat_tools/issn_importer.py @@ -0,0 +1,72 @@ + +import sys +import json +import itertools +import fatcat_client +from fatcat.importer_common import FatcatImporter + +# CSV format (generated from git.archive.org/webgroup/oa-journal-analysis): +# ISSN-L,in_doaj,in_road,in_norwegian,in_crossref,title,publisher,url,lang,ISSN-print,ISSN-electronic,doi_count,has_doi,is_oa,is_kept,publisher_size,url_live,url_live_status,url_live_final_status,url_live_final_url,url_live_status_simple,url_live_final_status_simple,url_domain,gwb_pdf_count + +def or_none(s): + if s is None: + return None + if len(s) == 0: + return None + return s + +def truthy(s): + if s is None: + return None + s = s.lower() + if s in ('true', 't', 'yes', 'y', '1'): + return True + elif s in ('false', 'f', 'no', 'n', '0'): + return False + else: + return None + +class FatcatIssnImporter(FatcatImporter): + + def parse_issn_row(self, row): + """ + row is a python dict (parsed from CSV). 
+ returns a ContainerEntity + """ + title = or_none(row['title']) + issnl = or_none(row['ISSN-L']) + if title is None or issnl is None: + return + extra = dict( + in_doaj=truthy(row['in_doaj']), + in_road=truthy(row['in_road']), + in_norwegian=truthy(row['in_norwegian']), + language=or_none(row['lang']), + url=or_none(row['url']), + ISSNp=or_none(row['ISSN-print']), + ISSNe=or_none(row['ISSN-electronic']), + is_oa=truthy(row['is_oa']), + is_kept=truthy(row['is_kept']), + ) + ce = fatcat_client.ContainerEntity( + issnl=issnl, + name=title, + publisher=or_none(row['publisher']), + abbrev=None, + coden=None, + extra=extra) + return ce + + def create_row(self, row, editgroup=None): + ce = self.parse_issn_row(row) + if ce is not None: + self.api.create_container(ce, editgroup=editgroup) + self.insert_count = self.insert_count + 1 + + def create_batch(self, batch, editgroup=None): + """Reads and processes in batches (not API-call-per-line)""" + objects = [self.parse_issn_row(l) + for l in batch if l != None] + objects = [o for o in objects if o != None] + self.api.create_container_batch(objects, autoaccept="true", editgroup=editgroup) + self.insert_count = self.insert_count + len(objects) diff --git a/python/fatcat_tools/matched_importer.py b/python/fatcat_tools/matched_importer.py new file mode 100644 index 00000000..7f55369b --- /dev/null +++ b/python/fatcat_tools/matched_importer.py @@ -0,0 +1,144 @@ + +import sys +import json +import sqlite3 +import itertools +import fatcat_client +from fatcat.importer_common import FatcatImporter + +#row = row.split('\t') +#assert len(row) == 2 +#sha1 = row[0].replace('sha1:') +#sha1 = base64.b16encode(base64.b32decode(sha1)).lower() +#print(sha1) +#dois = [d.lower() for d in json.loads(row[1])] + +class FatcatMatchedImporter(FatcatImporter): + """ + Input format is JSON with keys: + - dois (list) + - sha1 (hex) + - md5 (hex) + - sha256 (hex) + - size (int) + - cdx (list of objects) + - dt + - url + - mimetype + - urls (list of 
strings... or objects?) + + Future handlings/extensions: + - core_id, wikidata_id, pmcid, pmid: not as lists + """ + + def __init__(self, host_url, skip_file_update=False, default_mime=None, + default_link_rel="web"): + super().__init__(host_url) + self.default_mime = default_mime + self.default_link_rel = default_link_rel + self.skip_file_update = skip_file_update + + def make_url(self, raw): + rel = self.default_link_rel + # TODO: this is where we could map specific domains to rel types, + # and also filter out bad domains, invalid URLs, etc + if "//archive.org/" in raw or "//arxiv.org/" in raw: + # TODO: special-case the arxiv.org bulk mirror? + rel = "repository" + elif "//web.archive.org/" in raw or "//archive.is/" in raw: + rel = "webarchive" + return fatcat_client.FileEntityUrls(url=raw, rel=rel) + + def parse_matched_dict(self, obj): + sha1 = obj['sha1'] + dois = [d.lower() for d in obj.get('dois', [])] + + # lookup sha1, or create new entity + fe = None + if not self.skip_file_update: + try: + fe = self.api.lookup_file(sha1=sha1) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + if fe is None: + fe = fatcat_client.FileEntity( + sha1=sha1, + releases=[], + urls=[], + ) + + # lookup dois + re_list = set() + for doi in dois: + try: + re = self.api.lookup_release(doi=doi) + except fatcat_client.rest.ApiException as err: + if err.status != 404: + raise err + re = None + if re is None: + print("DOI not found: {}".format(doi)) + else: + re_list.add(re.ident) + if len(re_list) == 0: + return None + if fe.releases == set(re_list): + return None + re_list.update(fe.releases) + fe.releases = list(re_list) + + # parse URLs and CDX + existing_urls = [feu.url for feu in fe.urls] + for url in obj.get('url', []): + if url not in existing_urls: + url = self.make_url(url) + if url != None: + fe.urls.append(url) + for cdx in obj.get('cdx', []): + original = cdx['url'] + wayback = "https://web.archive.org/web/{}/{}".format( + cdx['dt'], + 
original) + if wayback not in existing_urls: + fe.urls.append( + fatcat_client.FileEntityUrls(url=wayback, rel="webarchive")) + if original not in existing_urls: + url = self.make_url(original) + if url != None: + fe.urls.append(url) + + if obj.get('size') != None: + fe.size = int(obj['size']) + fe.sha256 = obj.get('sha256', fe.sha256) + fe.md5 = obj.get('md5', fe.sha256) + if obj.get('mimetype') is None: + if fe.mimetype is None: + fe.mimetype = self.default_mime + else: + fe.mimetype = obj.get('mimetype') + return fe + + def create_row(self, row, editgroup=None): + obj = json.loads(row) + fe = self.parse_matched_dict(obj) + if fe is not None: + if fe.ident is None: + self.api.create_file(fe, editgroup=editgroup) + self.insert_count = self.insert_count + 1 + else: + self.api.update_file(fe.ident, fe, editgroup=editgroup) + self.update_count = self.update_count + 1 + + def create_batch(self, batch, editgroup=None): + """Reads and processes in batches (not API-call-per-line)""" + objects = [self.parse_matched_dict(json.loads(l)) + for l in batch if l != None] + new_objects = [o for o in objects if o != None and o.ident == None] + update_objects = [o for o in objects if o != None and o.ident != None] + for obj in update_objects: + self.api.update_file(obj.ident, obj, editgroup=editgroup) + if len(new_objects) > 0: + self.api.create_file_batch(new_objects, autoaccept="true", editgroup=editgroup) + self.update_count = self.update_count + len(update_objects) + self.insert_count = self.insert_count + len(new_objects) diff --git a/python/fatcat_tools/orcid_importer.py b/python/fatcat_tools/orcid_importer.py new file mode 100644 index 00000000..e1f5943c --- /dev/null +++ b/python/fatcat_tools/orcid_importer.py @@ -0,0 +1,73 @@ + +import sys +import json +import itertools +import fatcat_client +from fatcat.importer_common import FatcatImporter + +def value_or_none(e): + if type(e) == dict: + e = e.get('value') + if type(e) == str and len(e) == 0: + e = None + # TODO: this 
is probably bogus; patched in desperation; remove? + if e: + try: + e.encode() + except UnicodeEncodeError: + # Invalid JSON? + print("BAD UNICODE") + return None + return e + +class FatcatOrcidImporter(FatcatImporter): + + def parse_orcid_dict(self, obj): + """ + obj is a python dict (parsed from json). + returns a CreatorEntity + """ + name = obj['person']['name'] + if name is None: + return None + extra = None + given = value_or_none(name.get('given-names')) + sur = value_or_none(name.get('family-name')) + display = value_or_none(name.get('credit-name')) + if display is None: + # TODO: sorry human beings + if given and sur: + display = "{} {}".format(given, sur) + elif sur: + display = sur + elif given: + display = given + else: + # must have *some* name + return None + orcid = obj['orcid-identifier']['path'] + if not self.is_orcid(orcid): + sys.stderr.write("Bad ORCID: {}\n".format(orcid)) + return None + ce = fatcat_client.CreatorEntity( + orcid=orcid, + given_name=given, + surname=sur, + display_name=display, + extra=extra) + return ce + + def create_row(self, row, editgroup=None): + obj = json.loads(row) + ce = self.parse_orcid_dict(obj) + if ce is not None: + self.api.create_creator(ce, editgroup=editgroup) + self.insert_count = self.insert_count + 1 + + def create_batch(self, batch, editgroup=None): + """Reads and processes in batches (not API-call-per-line)""" + objects = [self.parse_orcid_dict(json.loads(l)) + for l in batch if l != None] + objects = [o for o in objects if o != None] + self.api.create_creator_batch(objects, autoaccept="true", editgroup=editgroup) + self.insert_count = self.insert_count + len(objects) diff --git a/python/fatcat_tools/raw_api_client.py b/python/fatcat_tools/raw_api_client.py new file mode 100644 index 00000000..75151ebb --- /dev/null +++ b/python/fatcat_tools/raw_api_client.py @@ -0,0 +1,66 @@ + +import sys +import json +import requests + + +class RawFatcatApiClient: + + def __init__(self, host_url): + self.host_url = 
host_url + self.session = requests.Session() + self._issn_map = dict() + + def get(self, path, data=None): + headers = {"content-type": "application/json"} + return self.session.get(self.host_url + path, json=data, + headers=headers) + + def post(self, path, data=None): + headers = {"content-type": "application/json"} + return self.session.post(self.host_url + path, json=data, + headers=headers) + + def new_editgroup(self): + rv = self.post('/v0/editgroup', data=dict( + editor_id=1)) + print(rv) + print(rv.json()) + assert rv.status_code == 201 + editgroup_id = rv.json()['id'] + return editgroup_id + + def accept_editgroup(self, eg): + rv = self.post('/v0/editgroup/{}/accept'.format(eg)) + assert rv.status_code == 200 + return rv + + def import_issn_file(self, json_file, create_containers=False, batchsize=100): + eg = self.new_editgroup() + i = 0 + with open(json_file, 'r') as file: + for line in file: + if i % batchsize == 0: + sys.stdout.write('\n{}: '.format(i)) + if (i+1) % 20 == 0: + sys.stdout.write('.') + i = i + 1 + obj = json.loads(line) + if not ("author" in obj and "title" in obj): + continue + try: + self.import_crossref_dict(obj, editgroup=eg, + create_containers=create_containers) + except Exception as e: + print("ERROR: {}".format(e)) + if i % batchsize == 0: + self.accept_editgroup(eg) + eg = self.new_editgroup() + if i % batchsize != 0: + self.accept_editgroup(eg) + print("done!") + + def health(self): + rv = self.get("/health") + assert rv.status_code == 200 + return rv.json() diff --git a/python/fatcat_tools/worker_common.py b/python/fatcat_tools/worker_common.py new file mode 100644 index 00000000..77ea2c15 --- /dev/null +++ b/python/fatcat_tools/worker_common.py @@ -0,0 +1,25 @@ + +import re +import sys +import csv +import json +import itertools +import fatcat_client +from pykafka import KafkaClient +from fatcat_client.rest import ApiException + + +class FatcatWorker: + """ + Common code for for Kafka producers and consumers. 
+ """ + + def __init__(self, kafka_hosts, produce_topic=None, consume_topic=None, api_host_url=None): + if api_host_url: + conf = fatcat_client.Configuration() + conf.host = api_host_url + self.api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) + self.kafka = KafkaClient(hosts=kafka_hosts, broker_version="1.0.0") + self.produce_topic = produce_topic + self.consume_topic = consume_topic + diff --git a/python/fatcat_web/__init__.py b/python/fatcat_web/__init__.py new file mode 100644 index 00000000..aa12f972 --- /dev/null +++ b/python/fatcat_web/__init__.py @@ -0,0 +1,18 @@ + +from flask import Flask +from flask_uuid import FlaskUUID +from flask_debugtoolbar import DebugToolbarExtension +from config import Config +import fatcat_client + +toolbar = DebugToolbarExtension() +app = Flask(__name__) +app.config.from_object(Config) +toolbar = DebugToolbarExtension(app) +FlaskUUID(app) + +conf = fatcat_client.Configuration() +conf.host = "http://localhost:9411/v0" +api = fatcat_client.DefaultApi(fatcat_client.ApiClient(conf)) + +from fatcat import routes diff --git a/python/fatcat_web/routes.py b/python/fatcat_web/routes.py new file mode 100644 index 00000000..ddb56abd --- /dev/null +++ b/python/fatcat_web/routes.py @@ -0,0 +1,364 @@ + +import os +import json +from flask import Flask, render_template, send_from_directory, request, \ + url_for, abort, g, redirect, jsonify, session +from fatcat import app, api +from fatcat_client.rest import ApiException +from fatcat.search import do_search + + +### Views ################################################################### + +@app.route('/container//history', methods=['GET']) +def container_history(ident): + try: + entity = api.get_container(ident) + history = api.get_container_history(ident) + except ApiException as ae: + abort(ae.status) + #print(history) + return render_template('entity_history.html', + page_title=entity.name, + entity_type="container", + entity=entity, + history=history) + 
+@app.route('/container//edit', methods=['GET']) +def container_edit_view(ident): + try: + entity = api.get_container(ident) + except ApiException as ae: + abort(ae.status) + return render_template('entity_edit.html') + +#@app.route('/container//edit', methods=['POST']) +#def container_edit(ident): +# raise NotImplemented() +# params = dict() +# for k in request.form: +# if k.startswith('container_'): +# params[k[10:]] = request.form[k] +# edit = api.update_container(params=params) +# return redirect("/container/{}".format(edit.ident)) +# # else: +# #return render_template('container_edit.html') + +@app.route('/container/create', methods=['GET']) +def container_create_view(): + return render_template('container_create.html') + +@app.route('/container/create', methods=['POST']) +def container_create(): + params = dict() + for k in request.form: + if k.startswith('container_'): + params[k[10:]] = request.form[k] + edit = api.create_container(params=params) + return redirect("/container/{}".format(edit.ident)) + +@app.route('/container/lookup', methods=['GET']) +def container_lookup(): + issnl = request.args.get('issnl') + if issnl is None: + abort(400) + try: + resp = api.lookup_container(issnl) + except ApiException as ae: + abort(ae.status) + return redirect('/container/{}'.format(resp.ident)) + +@app.route('/container/', methods=['GET']) +def container_view(ident): + try: + entity = api.get_container(ident) + except ApiException as ae: + abort(ae.status) + return render_template('container_view.html', container=entity) + +@app.route('/creator//history', methods=['GET']) +def creator_history(ident): + try: + entity = api.get_creator(ident) + history = api.get_creator_history(ident) + except ApiException as ae: + abort(ae.status) + return render_template('entity_history.html', + page_title=entity.display_name, + entity_type="creator", + entity=entity, + history=history) + +@app.route('/creator//edit', methods=['GET']) +def creator_edit_view(ident): + try: + entity = 
api.get_creator(ident) + except ApiException as ae: + abort(ae.status) + return render_template('entity_edit.html') + +@app.route('/creator/lookup', methods=['GET']) +def creator_lookup(): + orcid = request.args.get('orcid') + if orcid is None: + abort(400) + try: + resp = api.lookup_creator(orcid) + except ApiException as ae: + abort(ae.status) + return redirect('/creator/{}'.format(resp.ident)) + +@app.route('/creator/', methods=['GET']) +def creator_view(ident): + try: + entity = api.get_creator(ident) + releases = api.get_creator_releases(ident) + except ApiException as ae: + abort(ae.status) + return render_template('creator_view.html', creator=entity, releases=releases) + +@app.route('/file//history', methods=['GET']) +def file_history(ident): + try: + entity = api.get_file(ident) + history = api.get_file_history(ident) + except ApiException as ae: + abort(ae.status) + return render_template('entity_history.html', + page_title=None, + entity_type="file", + entity=entity, + history=history) + +@app.route('/file//edit', methods=['GET']) +def file_edit_view(ident): + try: + entity = api.get_file(ident) + except ApiException as ae: + abort(ae.status) + return render_template('entity_edit.html') + +@app.route('/file/lookup', methods=['GET']) +def file_lookup(): + sha1 = request.args.get('sha1') + if sha1 is None: + abort(400) + try: + resp = api.lookup_file(sha1) + except ApiException as ae: + abort(ae.status) + return redirect('/file/{}'.format(resp.ident)) + +@app.route('/file/', methods=['GET']) +def file_view(ident): + try: + entity = api.get_file(ident) + except ApiException as ae: + abort(ae.status) + return render_template('file_view.html', file=entity) + +@app.route('/release/lookup', methods=['GET']) +def release_lookup(): + doi = request.args.get('doi') + if doi is None: + abort(400) + try: + resp = api.lookup_release(doi) + except ApiException as ae: + abort(ae.status) + return redirect('/release/{}'.format(resp.ident)) + +@app.route('/release/create', 
methods=['GET']) +def release_create_view(): + return render_template('release_create.html') + +@app.route('/release/create', methods=['POST']) +def release_create(): + params = dict() + for k in request.form: + if k.startswith('release_'): + params[k[10:]] = request.form[k] + edit = api.create_release(params=params) + return redirect("/release/{}".format(edit.ident)) + +@app.route('/release//history', methods=['GET']) +def release_history(ident): + try: + entity = api.get_release(ident) + history = api.get_release_history(ident) + except ApiException as ae: + abort(ae.status) + return render_template('entity_history.html', + page_title=entity.title, + entity_type="release", + entity=entity, + history=history) + +@app.route('/release//edit', methods=['GET']) +def release_edit_view(ident): + try: + entity = api.get_release(ident) + except ApiException as ae: + abort(ae.status) + return render_template('entity_edit.html') + +@app.route('/release/', methods=['GET']) +def release_view(ident): + try: + entity = api.get_release(ident) + files = api.get_release_files(ident) + container = None + if entity.container_id is not None: + container = api.get_container(entity.container_id) + except ApiException as ae: + abort(ae.status) + authors = [c for c in entity.contribs if c.role in ('author', None)] + authors = sorted(authors, key=lambda c: c.index) + for fe in files: + # crudely filter out exact duplicates + kept = [] + for u in fe.urls: + if not u in kept: + kept.append(u) + fe.urls = [u for u in kept if not '/web/None/' in u.url] + return render_template('release_view.html', release=entity, + authors=authors, files=files, container=container) + +@app.route('/work/create', methods=['GET']) +def work_create_view(): + return abort(404) + +@app.route('/work//history', methods=['GET']) +def work_history(ident): + try: + entity = api.get_work(ident) + history = api.get_work_history(ident) + except ApiException as ae: + abort(ae.status) + return 
render_template('entity_history.html', + page_title=None, + entity_type="work", + entity=entity, + history=history) + +@app.route('/work//edit', methods=['GET']) +def work_edit_view(ident): + try: + entity = api.get_work(ident) + except ApiException as ae: + abort(ae.status) + return render_template('entity_edit.html') + +@app.route('/work/', methods=['GET']) +def work_view(ident): + try: + entity = api.get_work(ident) + releases = api.get_work_releases(ident) + except ApiException as ae: + abort(ae.status) + return render_template('work_view.html', work=entity, releases=releases) + +@app.route('/editgroup/current', methods=['GET']) +def editgroup_current(): + raise NotImplemented() + #eg = api.get_or_create_editgroup() + #return redirect('/editgroup/{}'.format(eg.id)) + +@app.route('/editgroup/', methods=['GET']) +def editgroup_view(ident): + try: + entity = api.get_editgroup(str(ident)) + except ApiException as ae: + abort(ae.status) + return render_template('editgroup_view.html', editgroup=entity) + +@app.route('/editor/', methods=['GET']) +def editor_view(ident): + entity = api.get_editor(ident) + return render_template('editor_view.html', editor=entity) + +@app.route('/editor//changelog', methods=['GET']) +def editor_changelog(ident): + editor = api.get_editor(ident) + changelog_entries = api.get_editor_changelog(ident) + return render_template('editor_changelog.html', editor=editor, + changelog_entries=changelog_entries) + +@app.route('/changelog', methods=['GET']) +def changelog_view(): + try: + entries = api.get_changelog(limit=request.args.get('limit')) + except ApiException as ae: + abort(ae.status) + return render_template('changelog.html', entries=entries) + +@app.route('/changelog/', methods=['GET']) +def changelog_entry_view(index): + try: + entry = api.get_changelog_entry(int(index)) + except ApiException as ae: + abort(ae.status) + return render_template('changelog_view.html', entry=entry, editgroup=entry.editgroup) + +@app.route('/stats', 
methods=['GET']) +def stats_view(): + stats = api.get_stats() + return render_template('stats.html', stats=stats.extra) + +### Search ################################################################## + +@app.route('/release/search', methods=['GET', 'POST']) +def search(): + + limit = 20 + query = request.args.get('q') + fulltext_only = bool(request.args.get('fulltext_only')) + + # Convert raw DOIs to DOI queries + if query is not None: + oldquery = query.split() + for word in oldquery: + if word.startswith("10.") and word.count("/") >= 1: + query = query.replace(word, 'doi:"{}"'.format(word)) + + if 'q' in request.args.keys(): + # always do files for HTML + found = do_search(query, limit=limit, fulltext_only=fulltext_only) + return render_template('release_search.html', found=found, query=query, fulltext_only=fulltext_only) + else: + return render_template('release_search.html', query=query, fulltext_only=fulltext_only) + + +### Static Routes ########################################################### + +@app.errorhandler(404) +def page_not_found(e): + return render_template('404.html'), 404 + +@app.route('/', methods=['GET']) +def homepage(): + return render_template('home.html') + +@app.route('/about', methods=['GET']) +def aboutpage(): + return render_template('about.html') + +@app.route('/search', methods=['GET']) +def search_redirect(): + return redirect("/release/search") + +@app.route('/robots.txt', methods=['GET']) +def robots(): + return send_from_directory(os.path.join(app.root_path, 'static'), + 'robots.txt', + mimetype='text/plain') + +@app.route('/static/fatcat.jpg', methods=['GET']) +def fatcat_photo(): + return send_from_directory(os.path.join(app.root_path, 'static'), + 'fatcat.jpg', + mimetype='image/jpeg') + +@app.route('/health', methods=['GET']) +def health(): + return jsonify({'ok': True}) diff --git a/python/fatcat_web/search.py b/python/fatcat_web/search.py new file mode 100644 index 00000000..b6826110 --- /dev/null +++ 
b/python/fatcat_web/search.py @@ -0,0 +1,60 @@ + +import requests +from flask import abort +from fatcat import app + + +def do_search(q, limit=50, fulltext_only=True): + + #print("Search hit: " + q) + if limit > 100: + # Sanity check + limit = 100 + + if fulltext_only: + q += " file_in_ia:true" + + search_request = { + "query": { + "query_string": { + "query": q, + "analyzer": "textIcuSearch", + "default_operator": "AND", + "analyze_wildcard": True, + "lenient": True, + "fields": ["title^5", "contrib_names^2", "container_title"] + }, + }, + "size": int(limit), + } + + #print(search_request) + resp = requests.get("%s/%s/_search" % + (app.config['ELASTIC_BACKEND'], app.config['ELASTIC_INDEX']), + json=search_request) + + if resp.status_code != 200: + print("elasticsearch non-200 status code: " + str(resp.status_code)) + print(resp.content) + abort(resp.status_code) + + content = resp.json() + #print(content) + results = [h['_source'] for h in content['hits']['hits']] + for h in results: + # Ensure 'contrib_names' is a list, not a single string + if type(h['contrib_names']) is not list: + h['contrib_names'] = [h['contrib_names'], ] + # Handle surrogate strings that elasticsearch returns sometimes, + # probably due to mangled data processing in some pipeline. 
+ # "Crimes against Unicode"; production workaround + for key in h: + if type(h[key]) is str: + h[key] = h[key].encode('utf8', 'ignore').decode('utf8') + h['contrib_names'] = [name.encode('utf8', 'ignore').decode('utf8') for name in h['contrib_names']] + + found = content['hits']['total'] + return {"query": { "q": q }, + "count_returned": len(results), + "count_found": found, + "results": results } diff --git a/python/fatcat_web/static/fatcat.jpg b/python/fatcat_web/static/fatcat.jpg new file mode 100644 index 00000000..ad100381 Binary files /dev/null and b/python/fatcat_web/static/fatcat.jpg differ diff --git a/python/fatcat_web/static/robots.txt b/python/fatcat_web/static/robots.txt new file mode 100644 index 00000000..a168f11b --- /dev/null +++ b/python/fatcat_web/static/robots.txt @@ -0,0 +1 @@ +# Hello friends! diff --git a/python/fatcat_web/templates/404.html b/python/fatcat_web/templates/404.html new file mode 100644 index 00000000..c8fbfeac --- /dev/null +++ b/python/fatcat_web/templates/404.html @@ -0,0 +1,6 @@ +{% extends "base.html" %} +{% block body %} + +

404: Not Found

+ +{% endblock %} diff --git a/python/fatcat_web/templates/about.html b/python/fatcat_web/templates/about.html new file mode 100644 index 00000000..85f100b7 --- /dev/null +++ b/python/fatcat_web/templates/about.html @@ -0,0 +1,190 @@ +{% extends "base.html" %} +{% block body %} + +

fatcat Design Document (RFC)

+

Contact: Bryan Newbold bnewbold@archive.org. Last updated 2018-08-10

+

fatcat is a proposed open bibliographic catalog of written works. The scope of works is somewhat flexible, with a focus on published research outputs like journal articles, pre-prints, and conference proceedings. Records are collaboratively editable, versioned, available in bulk form, and include URL-agnostic file-level metadata.

+

fatcat is currently used internally at the Internet Archive, but interested folks are welcome to contribute to design and development.

+

Goals and Ecosystem Niche

+

For the Internet Archive, fatcat has two primary use cases:

+
    +
  • Track the "completeness" of our holdings against all known published works. In particular, allow us to monitor and prioritize further collection work.
  • +
  • Be a public-facing catalog and access mechanism for our open access holdings.
  • +
+

In the larger ecosystem, fatcat could also provide:

+
    +
  • A work-level (as opposed to title-level) archival dashboard: what fraction of all published works are preserved in archives? KBART, CLOCKSS, Portico, and other preservation networks don't provide granular metadata
  • +
  • A collaborative, independent, non-commercial, fully-open, field-agnostic, "completeness"-oriented catalog of scholarly metadata
  • +
  • Unified (centralized) foundation for discovery and access across repositories and archives: discovery projects can focus on user experience instead of building their own catalog from scratch
  • +
  • Research corpus for meta-science, with an emphasis on availability and reproducibility (metadata corpus itself is open access, and file-level hashes control for content drift)
  • +
  • Foundational infrastructure for distributed digital preservation
  • +
  • On-ramp for non-traditional digital works ("grey literature") into the scholarly web
  • +
+

Technical Architecture

+

The canonical backend datastore exposes a microservice-like HTTP API, which could be extended with gRPC or GraphQL interfaces. The initial datastore is a transactional SQL database, but this implementation detail is abstracted by the API.

+

As little "application logic" as possible should be embedded in this back-end; as much as possible would be pushed to bots which could be authored and operated by anybody. A separate web interface project talks to the API backend and can be developed more rapidly with less concern about data loss or corruption.

+

A cronjob will create periodic database dumps, both in "full" form (all tables and all edit history, removing only authentication credentials) and "flattened" form (with only the most recent version of each entity).

+

A goal is to be linked-data/RDF/JSON-LD/semantic-web "compatible", but not necessarily "first". It should be possible to export the database in a relatively clean RDF form, and to fetch data in a variety of formats, but internally fatcat will not be backed by a triple-store, and will not be bound to a rigid third-party ontology or schema.

+

Microservice daemons should be able to proxy between the primary API and standard protocols like ResourceSync and OAI-PMH, and third party bots could ingest or synchronize the database in those formats.

+

Licensing

+

The core fatcat database should only contain verifiable factual statements (which isn't to say that all statements are "true"), not creative or derived content.

+

The goal is to have a very permissively licensed database: CC-0 (no rights reserved) if possible. Under US law, it should be possible to scrape and pull in factual data from other corpuses without adopting their licenses. The goal here isn't to avoid attribution (provenance information will be included, and a large sources and acknowledgments statement should be maintained and shipped with bulk exports), but trying to manage the intersection of all upstream source licenses seems untenable, and creates burdens for downstream users and developers.

+

Special care will need to be taken around copyright, "original work" by editors, and contributions that raise privacy concerns. If abstracts are stored at all, they should be in a partitioned database table to prevent copyright contamination. Likewise, even simple user-created content like lists, reviews, ratings, comments, discussion, documentation, etc., should live in separate services.

+

Basic Editing Workflow and Bots

+

Both human editors and bots should have edits go through the same API, with humans using either the default web interface, integrations, or client software.

+

The normal workflow is to create edits (or updates, merges, deletions) on individual entities. Individual changes are bundled into an "edit group" of related edits (eg, correcting authorship info for multiple works related to a single author). When ready, the editor would "submit" the edit group for review. During the review period, human editors vote and bots can perform automated checks. During this period the editor can make tweaks if necessary. After some fixed time period (72 hours?) with no changes and no blocking issues, the edit group would be auto-accepted if no merge conflicts have been created by other edits to the same entities. This process balances editing labor (reviews are easy, but optional) against quality (cool-down period makes it easier to detect and prevent spam or out-of-control bots). More sophisticated roles and permissions could allow certain humans and bots to push through edits more rapidly (eg, importing new works from a publisher API).

+

Bots need to be tuned to have appropriate edit group sizes (eg, daily batches, instead of millions of works in a single edit) to make human QA review and reverts manageable.

+

Data progeny and source references are captured in the edit metadata, instead of being encoded in the entity data model itself. In the case of importing external databases, the expectation is that special-purpose bot accounts would be used, and would tag timestamps and external identifiers in the edit metadata. Human editors would leave edit messages to clarify their sources.

+

A style guide (wiki) and discussion forum would be hosted as separate stand-alone services for editors to propose projects and debate process or scope changes. These services should have unified accounts and logins (oauth?) to have consistent account IDs across all mediums.

+

Global Edit Changelog

+

As part of the process of "accepting" an edit group, a row would be written to an immutable, append-only log table (which internally could be a SQL table) documenting each identifier change. This changelog establishes a monotonically increasing version number for the entire corpus, and should make interaction with other systems easier (eg, search engines, replicated databases, alternative storage backends, notification frameworks, etc.).

+

Identifiers

+

A fixed number of first-class "entities" are defined, with common behavior and schema layouts. These are all semantic entities like "work", "release", "container", and "creator".

+

fatcat identifiers are semantically meaningless fixed-length random numbers, usually represented in case-insensitive base32 format. Each entity type has its own identifier namespace.

+

128-bit (UUID size) identifiers encode as 26 characters (but note that not all such strings decode to valid UUIDs), and in the backend can be serialized in UUID columns:

+
work_rzga5b9cd7efgh04iljk8f3jvz
+https://fatcat.wiki/work/rzga5b9cd7efgh04iljk8f3jvz
+

In comparison, 96-bit identifiers would have 20 characters and look like:

+
work_rzga5b9cd7efgh04iljk
+https://fatcat.wiki/work/rzga5b9cd7efgh04iljk
+

A 64-bit namespace would probably be large enough, and would work with database Integer columns:

+
work_rzga5b9cd7efg
+https://fatcat.wiki/work/rzga5b9cd7efg
+

The idea would be to only have fatcat identifiers be used to interlink between databases, not to supplant DOIs, ISBNs, handles, ARKs, and other "registered" persistent identifiers.

+

Entities and Internal Schema

+

Internally, identifiers would be lightweight pointers to "revisions" of an entity. Revisions are stored in their complete form, not as a patch or difference; if comparing to distributed version control systems, this is the git model, not the mercurial model.

+

The entity revisions are immutable once accepted; the editing process involves the creation of new entity revisions and, if the edit is approved, pointing the identifier to the new revision. Entities cross-reference between themselves by identifier not revision number. Identifier pointers also support (versioned) deletion and redirects (for merging entities).

+

Edit objects represent a change to a single entity; edits get batched together into edit groups (like "commits" and "pull requests" in git parlance).

+

SQL tables would probably look something like the following (but specific to each entity type, with tables like work_revision not entity_revision):

+
entity_ident
+    id (uuid)
+    current_revision (entity_revision foreign key)
+    redirect_id (optional; points to another entity_ident)
+
+entity_revision
+    revision_id
+    <entity-specific fields>
+    extra: json blob for schema evolution
+
+entity_edit
+    timestamp
+    editgroup_id
+    ident (entity_ident foreign key)
+    new_revision (entity_revision foreign key)
+    previous_revision (optional; points to entity_revision)
+    extra: json blob for progeny metadata
+
+editgroup
+    editor_id
+    description
+    extra: json blob for progeny metadata
+

Additional entity-specific columns would hold actual metadata. Additional tables (which would reference both entity_revision and entity_id foreign keys as appropriate) would represent things like authorship relationships (creator/release), citations between works, etc. Every revision of an entity would require duplicating all of these associated rows, which could end up being a large source of inefficiency, but is necessary to represent the full history of an object.

+

Scope

+

The goal is to capture the "scholarly web": the graph of written works that cite other works. Any work that is both cited more than once and cites more than one other work in the catalog is very likely to be in scope. "Leaf nodes" and small islands of intra-cited works may or may not be in scope.

+

Overall focus is on written works, with some exceptions. The expected core focus (for which we would pursue "completeness") is:

+
journal articles
+academic books
+conference proceedings
+technical memos
+dissertations
+monographs
+well-researched blog posts
+web pages (that have citations)
+"white papers"
+

Possibly in scope:

+
reports
+magazine articles
+essays
+notable mailing list postings
+government documents
+presentations (slides, video)
+datasets
+well-researched wiki pages
+patents
+

Probably not:

+
court cases and legal documents
+newspaper articles
+social media
+manuals
+datasheets
+courses
+published poetry
+

Definitely not:

+
audio recordings
+tv show episodes
+musical scores
+advertisements
+

Author, citation, and work disambiguation would be core tasks. Linking pre-prints to final publication is in scope.

+

I'm much less interested in altmetrics, funding, and grant relationships than most existing databases in this space.

+

fatcat would not include any fulltext content itself, even for cleanly licensed (open access) works, but would have "strong" (verified) links to fulltext content, and would include file-level metadata (like hashes and fingerprints) to help discovery and identify content from any source. File-level URLs with context ("repository", "author-homepage", "web-archive") should make fatcat more useful for both humans and machines to quickly access fulltext content of a given mimetype than existing redirect or landing page systems. So another factor in deciding scope is whether a work has "digital fixity" and can be contained in a single immutable file.

+

Ontology

+

Loosely following FRBR (Functional Requirements for Bibliographic Records), but removing the "manifestation" abstraction, and favoring files (digital artifacts) over physical items, the primary entities are:

+
work
+    <a stub, for grouping releases>
+
+release (aka "edition", "variant")
+    title
+    volume/pages/issue/chapter
+    media/formfactor
+    publication/peer-review status
+    language
+    <published> date
+    <variant-of> work
+    <published-in> container
+    <has-contributors> creator
+    <citation-to> release
+    <has> identifier
+
+file (aka "digital artifact")
+    <instantiates> release
+    hashes/checksums
+    mimetype
+    <found-at> URLs
+
+creator (aka "author")
+    name
+    identifiers
+    aliases
+
+container (aka "venue", "serial", "title")
+    name
+    open-access policy
+    peer-review policy
+    <has> aliases, acronyms
+    <about> subject/category
+    <has> identifier
+    <published-in> container
+    <published-by> publisher
+

Controlled Vocabularies

+

Some special namespace tables and enums would probably be helpful; these could live in the database (not requiring a database migration to update), but should have a more controlled editing workflow... perhaps versioned in the codebase:

+
    +
  • identifier namespaces (DOI, ISBN, ISSN, ORCID, etc; but not the identifiers themselves)
  • +
  • subject categorization
  • +
  • license and open access status
  • +
  • work "types" (article vs. book chapter vs. proceeding, etc)
  • +
  • contributor types (author, translator, illustrator, etc)
  • +
  • human languages
  • +
  • file mimetypes
  • +
+

These could also be enforced by QA bots that review all editgroups.

+

Unresolved Questions

+

How to handle translations of, eg, titles and author names? To be clear, not translations of works (which are just separate releases), these are more like aliases or "originally known as".

+

Are bi-directional links a schema anti-pattern? Eg, should "work" point to a "primary release" (which itself points back to the work)?

+

Should identifier and citation be their own entities, referencing other entities by UUID instead of by revision? Not sure if this would increase or decrease database resource utilization.

+

Should contributor/author affiliation and contact information be retained? It could be very useful for disambiguation, but we don't want to build a huge database for spammers or "innovative" start-up marketing.

+

Can general-purpose SQL databases like Postgres or MySQL scale well enough to hold several tables with billions of entity revisions? Right from the start there are hundreds of millions of works and releases, many of which have dozens of citations, many authors, and many identifiers, and then we'll have potentially dozens of edits for each of these, which multiply out to 1e8 * 2e1 * 2e1 = 4e10, or 40 billion rows in the citation table. If each row was 32 bytes on average (uncompressed, not including index size), that would be 1.3 TByte on its own, larger than common SSD disks. I do think a transactional SQL datastore is the right answer. In my experience locking and index rebuild times are usually the biggest scaling challenges; the largely-immutable architecture here should mitigate locking. Hopefully few indexes would be needed in the primary database, as user interfaces could rely on secondary read-only search engines for more complex queries and views.

+

I see a tension between focus and scope creep. If a central database like fatcat doesn't support enough fields and metadata, then it will not be possible to completely import other corpuses, and this becomes "yet another" partial bibliographic database. On the other hand, accepting arbitrary data leads to other problems: sparseness increases (we have more "partial" data), potential for redundancy is high, humans will start editing content that might be bulk-replaced, etc.

+

There might be a need to support "stub" references between entities. Eg, when adding citations from PDF extraction, the cited works are likely to be ambiguous. Could create "stub" works to be merged/resolved later, or could leave the citation hanging. Same with authors, containers (journals), etc.

+

References and Previous Work

+

The closest overall analog of fatcat is MusicBrainz, a collaboratively edited music database. Open Library is a very similar existing service, which exclusively contains book metadata.

+

Wikidata seems to be the most successful and actively edited/developed open bibliographic database at this time (early 2018), including the wikicite conference and related Wikimedia/Wikipedia projects. Wikidata is a general purpose semantic database of entities, facts, and relationships; bibliographic metadata has become a large fraction of all content in recent years. The focus there seems to be linking knowledge (statements) to specific sources unambiguously. Potential advantages fatcat would have would be a focus on a specific scope (not a general-purpose database of entities) and a goal of completeness (capturing as many works and relationships as rapidly as possible). However, it might be better to just pitch in to the wikidata efforts.

+

The technical design of fatcat is loosely inspired by the git branch/tag/commit/tree architecture, and specifically inspired by Oliver Charles' "New Edit System" blog posts from 2012.

+

There are a whole bunch of proprietary, for-profit bibliographic databases, including Web of Science, Google Scholar, Microsoft Academic Graph, aminer, Scopus, and Dimensions. There are excellent field-limited databases like dblp, MEDLINE, and Semantic Scholar. There are some large general-purpose databases that are not directly user-editable, including the OpenCitation corpus, CORE, BASE, and CrossRef. I don't know of any large (more than 60 million works), open (bulk-downloadable with permissive or no license), field agnostic, user-editable corpus of scholarly publication bibliographic metadata.

+

RFC Changelog

+
    +
  • 2017-12-16: early notes
  • +
  • 2018-01-17: initial RFC document
  • +
  • 2018-08-10: updates from implementation work
  • +
+ +{% endblock %} diff --git a/python/fatcat_web/templates/base.html b/python/fatcat_web/templates/base.html new file mode 100644 index 00000000..856a6e03 --- /dev/null +++ b/python/fatcat_web/templates/base.html @@ -0,0 +1,78 @@ + + + + + + + {% block title %}fatcat!{% endblock %} + + + + + + + + + + + + +
+{% block fullbody %} +
+ {% block body %}Nothing to see here.{% endblock %} +
+{% endblock %} +
+ + + + +{% block postscript %}{% endblock %} + + + diff --git a/python/fatcat_web/templates/changelog.html b/python/fatcat_web/templates/changelog.html new file mode 100644 index 00000000..f33fe7c8 --- /dev/null +++ b/python/fatcat_web/templates/changelog.html @@ -0,0 +1,25 @@ +{% extends "base.html" %} +{% block body %} + +

Recent Changes +
changelog

+ +Limited to the most recent ~50 entries. + + + + {% for entry in entries %} +
Changelog
Index +
Timestamp (UTC) + Editgroup + Editor + Description +
{{ entry.index }} + {{ entry.timestamp }} + {{ entry.editgroup_id }} + {{ entry.editgroup.editor_id }} + {% if entry.editgroup.description != None %}{{ entry.editgroup.description }}{% endif %} + {% endfor %} +
+ +{% endblock %} diff --git a/python/fatcat_web/templates/changelog_view.html b/python/fatcat_web/templates/changelog_view.html new file mode 100644 index 00000000..22aff9bc --- /dev/null +++ b/python/fatcat_web/templates/changelog_view.html @@ -0,0 +1,13 @@ +{% extends "editgroup_view.html" %} +{% block editgroupheader %} + +

Changelog Entry +
+ changelog {{ entry.index }} +
+

+ +
Timestamp: {{ entry.timestamp }} +
Editgroup: {{ editgroup.id }} + +{% endblock %} diff --git a/python/fatcat_web/templates/container_create.html b/python/fatcat_web/templates/container_create.html new file mode 100644 index 00000000..15288142 --- /dev/null +++ b/python/fatcat_web/templates/container_create.html @@ -0,0 +1,168 @@ +{% extends "base.html" %} +{% block body %} +
+

Adding a New Container

+ +

A "container" is anything that groups publications together. For example, +a journal (eg, "New England Journal of Medicine"), conference proceedings, a +book series, or a blog. + +

Not all publications are in a container. + +

+ +

The Basics

+ +
+ + +
+ +
+ + +
+ + +
+ + +
+ + +
+ + +
+ + +
+ + +
+ + +
+ + +
+ + + + + + +

Anything Else?

+ +
Create container
+ +

Entity will be created as part of the current edit group, which needs to be +submitted and approved before the entity will formally be included in the +catalog. + +

+ +
+{% endblock %} + +{% block postscript %} + +{% endblock %} diff --git a/python/fatcat_web/templates/container_view.html b/python/fatcat_web/templates/container_view.html new file mode 100644 index 00000000..c2ca7327 --- /dev/null +++ b/python/fatcat_web/templates/container_view.html @@ -0,0 +1,108 @@ +{% extends "base.html" %} +{% block fullbody %} + +
+
+
+

{{ container.name }} +
container {{ container.ident }}

+
+
+ +
+
+
+ +

Publisher: +{% if container.publisher != None %}{{ container.publisher }}{% else %}Unknown{% endif %} +{% if container.coden != None %} +
CODEN?:  {{ container.coden }} +{% endif %} +{% if container.abbrev != None %} +
Abbrev.:  {{ container.abbrev }} +{% endif %} +{% if (container.extra != None) and (container.extra['url'] != None) and (container.extra['url']|length > 0) %} +
Homepage:  {{ container.extra['url'] }} +{% endif %} +{% if container.wikidata_qid != None %} +
Wikidata Entity:  {{ container.wikidata_qid }} +{% endif %} + +{% if container.extra != None %} +

Extra Metadata (raw JSON)

+{% for (key, value) in container.extra.items() %} +{{ key }}: {{ value }}
+{% endfor %} +{% endif %} + + + +
+
+
+ +{% if container.extra.is_oa == True %} +Open Access Publisher +{% elif container.extra.is_oa == False %} +Not Open Access +{% else %} +Unknown OA Status +{% endif %} +
+ +{% if container.issnl != None %} + ISSN-L? +  {{ container.issnl }} + {% if container.extra != None and (container.extra.ISSNp|length > 0) %} +
Print:  {{ container.extra.ISSNp }} + {% endif %} + {% if container.extra != None and (container.extra.ISSNe|length > 0) %} +
Electronic:  {{ container.extra.ISSNe }} + {% endif %} +
+{% endif %} + + +Directory Listings
+{% if (container.extra != None) %} + {% if container.extra.in_doaj == True %} + In DOAJ
+ {% elif container.extra.in_doaj == False %} + Not in DOAJ
+ {% endif %} + {% if container.extra.in_road == True %} + In ISSN ROAD
+ {% elif container.extra.in_road == False %} + Not in ISSN ROAD
+ {% endif %} + {% if container.extra.is_kept == True %} + In Keepers Registry
+ {% elif container.extra.is_kept == False %} + Not in Keepers Registry
+ {% endif %} +{% endif %} +
+ +Lookup Links +
SHERPA/RoMEO (access policies) +
wikidata.org +
+ +Fatcat Bits +

State is "{{ container.state }}". Revision: +
{{ container.revision }} +
As JSON object via API + +

+ + +
+
+ +{% endblock %} diff --git a/python/fatcat_web/templates/creator_view.html b/python/fatcat_web/templates/creator_view.html new file mode 100644 index 00000000..2ce01fb6 --- /dev/null +++ b/python/fatcat_web/templates/creator_view.html @@ -0,0 +1,82 @@ +{% extends "base.html" %} +{% block fullbody %} + +
+
+
+

{{ creator.display_name }} +
creator {{ creator.ident }}

+
+
+ +
+
+
+ +

Given ("first") name: + {% if creator.given_name != None %}{{ creator.given_name}}{% else %}None or unknown{% endif %} +

Sur ("family"/"last") name: + {% if creator.surname != None %}{{ creator.surname }}{% else %}None or unknown{% endif %} + +{% if creator.wikidata_qid != None %} +
Wikidata Entity:  {{ creator.wikidata_qid }} +{% endif %} +{% if creator.extra != None %} +

Extra Metadata (raw JSON)

+{% for (key, value) in creator.extra.items() %} +{{ key }}: {{ value }}
+{% endfor %} +{% endif %} + +
+

Releases

+{% if releases != [] %} +

This creator has contributed to: +

    + {% for release in releases %} +
  • "{{ release.title }}", a {{ release.release_type }} published {{ release.release_date }} + {% if release.release_status != None %}(status: {{ release.release_status }}){% endif %}. +
    Fatcat ID: {{ release.ident }} + {% endfor %} +
+{% else %} +This creator has not contributed to any releases. +{% endif %} + + + +
+
+
+ +{% if creator.orcid != None %} + ORCID?: +  {{ creator.orcid }} +
+{% endif %} + +Lookup Links +
wikidata.org +
VIAF +
dblp (CS) +
Google Scholar +
+ +Fatcat Bits +

State is "{{ creator.state }}". Revision: +
{{ creator.revision }} +
As JSON object via API + +

+ + +
+
+ +{% endblock %} diff --git a/python/fatcat_web/templates/editgroup_view.html b/python/fatcat_web/templates/editgroup_view.html new file mode 100644 index 00000000..ac3228b0 --- /dev/null +++ b/python/fatcat_web/templates/editgroup_view.html @@ -0,0 +1,54 @@ +{% extends "base.html" %} +{% block body %} + +{# extended by changelog_entry #} +{% block editgroupheader %} +

Edit Group +
editgroup {{ editgroup.id }}

+{% endblock %} + +{# TODO:

Editor: {{ editgroup.editor.username }} #} +
Editor: {{ editgroup.editor_id }} +
Description: {{ editgroup.description }} + +

Work Edits ({{ editgroup.edits.works|count }})

+ + +

Release Edits ({{ editgroup.edits.releases|count }})

+ + +

Container Edits ({{ editgroup.edits.containers|count }})

+ + +

Creator Edits ({{ editgroup.edits.creators|count }})

+ + +

File Edits ({{ editgroup.edits.files|count }})

+ + +{% endblock %} diff --git a/python/fatcat_web/templates/editor_changelog.html b/python/fatcat_web/templates/editor_changelog.html new file mode 100644 index 00000000..79127312 --- /dev/null +++ b/python/fatcat_web/templates/editor_changelog.html @@ -0,0 +1,29 @@ +{% extends "base.html" %} +{% block body %} + +

Editor Changelog: {{ editor.username }} + +

+ +

Changes accepted (aka, merged editgroups): + + + {% for entry in changelog_entries %} +
Changelog
Index +
Timestamp (UTC) + Editgroup + Editor + Description +
{{ entry.index }} + {{ entry.timestamp }} + {{ entry.editgroup_id }} + {{ entry.editgroup.editor_id }} + {% if entry.editgroup.description != None %}{{ entry.editgroup.description }}{% endif %} + {% endfor %} +
+ +{% endblock %} diff --git a/python/fatcat_web/templates/editor_view.html b/python/fatcat_web/templates/editor_view.html new file mode 100644 index 00000000..c9b61f5d --- /dev/null +++ b/python/fatcat_web/templates/editor_view.html @@ -0,0 +1,12 @@ +{% extends "base.html" %} +{% block body %} + +

{{ editor.username }} +
+ editor {{ editor.id }} +
+

+ +

View editor's changelog + +{% endblock %} diff --git a/python/fatcat_web/templates/entity_edit.html b/python/fatcat_web/templates/entity_edit.html new file mode 100644 index 00000000..5da98d89 --- /dev/null +++ b/python/fatcat_web/templates/entity_edit.html @@ -0,0 +1,8 @@ +{% extends "base.html" %} +{% block body %} + +

Not Implemented

+ +Entity editing isn't implemented yet, only creation. Sorry! + +{% endblock %} diff --git a/python/fatcat_web/templates/entity_history.html b/python/fatcat_web/templates/entity_history.html new file mode 100644 index 00000000..54577b2f --- /dev/null +++ b/python/fatcat_web/templates/entity_history.html @@ -0,0 +1,30 @@ +{% extends "base.html" %} +{% block body %} + +

{% if page_title != None %}{{ page_title }}{% endif %} + +

+ +

Fatcat Metadata Edit History

+ + + + {% for entry in history %} +
Changelog
Index +
Timestamp (UTC) + Editgroup + Editor + Description +
{{ entry.changelog_entry.index }} + {{ entry.changelog_entry.timestamp }} + {{ entry.editgroup.id }} + {{ entry.editgroup.editor_id }} + {% if entry.editgroup.description != None %}{{ entry.editgroup.description }}{% endif %} + {% endfor %} +
+ +{% endblock %} diff --git a/python/fatcat_web/templates/file_view.html b/python/fatcat_web/templates/file_view.html new file mode 100644 index 00000000..74977668 --- /dev/null +++ b/python/fatcat_web/templates/file_view.html @@ -0,0 +1,108 @@ +{% extends "base.html" %} +{% block fullbody %} + +
+
+
+

+
file {{ file.ident }}

+
+
+ +
+
+
+ +{% if file.extra != None %} +

Extra Metadata (raw JSON)

+{% for (key, value) in file.extra.items() %} +{{ key }}: {{ value }}
+{% endfor %} +{% endif %} + +

Releases

+{% if file.releases != None %} +

Releases associated with this file: +

+{% else %} +This file is not associated with any fatcat release. +{% endif %} + +

URLs

+{% if file.url != None %} +

Known locations of this file: +

+{% else %} +No known public URL, mirror, or archive for this file. +{% endif %} + +

Checksums

+ + + + {% if file.sha1 != None %} +
Algorithm + Value +
SHA-1 + {{ file.sha1 }} + {% endif %} + {% if file.sha256 != None %} +
SHA-256 + {{ file.sha256 }} + {% endif %} + {% if file.md5!= None %} +
MD5 + {{ file.md5 }} + {% endif %} +
+ + + +
+
+ +{% if file.urls != None and file.urls != [] %} +Download File +{% else %} +No Download Available +{% endif %} + +
+ +{% if file.size != None %} +

Size  {{ file.size }} (bytes) +

+{% endif %} + +{% if file.mimetype != None %} +

File Type  {{ file.mimetype }} +

+{% endif %} + +Fatcat Bits +

State is "{{ file.state }}". Revision: +
{{ file.revision }} +
As JSON object via API + +

+ + +
+
+ + +{% endblock %} diff --git a/python/fatcat_web/templates/home.html b/python/fatcat_web/templates/home.html new file mode 100644 index 00000000..4d3b44a1 --- /dev/null +++ b/python/fatcat_web/templates/home.html @@ -0,0 +1,91 @@ +{% extends "base.html" %} +{% block body %} +
+ +

Welcome to fatcat!

+ + + +
+
Current Status: Prototype
+
    +
  • No authentication or accounts +
  • Any edits will be lost +
  • Most creation/edit forms don't work +
  • Any data was bulk-imported, and may not be up to date +
  • Search results are from Crossref, not local API/database +
+
+ +

This is a versioned, user-editable catalog of research publications: journal +articles, conference proceedings, pre-prints, etc. Features include archival +file-level metadata (verified digests and long-term copies, in addition to +URLs), a documented API, and work/release +indexing (aka, linking together of pre-prints and final copies). +Read more... + +

+ + + + +
Entity + Actions + Examples + Lookup +
Release +
journal article, pre-print, book +
published version of a Work +
Create + Dummy +
Realistic +
+
+ + +
+
+
Container +
journal or serial +
Create + Dummy +
Realistic +
+
+ + +
+
+ +
Creator +
authors, editors, translators +
+ Dummy +
Realistic +
+
+ + +
+
+
File +
specific digital blobs (immutable) +
+ Dummy +
Realistic +
+
+ + +
+
+
Work +
for grouping Releases +
+ Dummy +
Realistic +
+
+ +

+{% endblock %} diff --git a/python/fatcat_web/templates/release_changelog.html b/python/fatcat_web/templates/release_changelog.html new file mode 100644 index 00000000..706a5642 --- /dev/null +++ b/python/fatcat_web/templates/release_changelog.html @@ -0,0 +1,17 @@ +{% extends "base.html" %} +{% block body %} + +

Release Changelog: {{ release.id }}

+ +

release: {{ release.id }} + +

Changelog: +

+ +{% endblock %} diff --git a/python/fatcat_web/templates/release_create.html b/python/fatcat_web/templates/release_create.html new file mode 100644 index 00000000..ac8a8169 --- /dev/null +++ b/python/fatcat_web/templates/release_create.html @@ -0,0 +1,215 @@ +{% extends "base.html" %} +{% block body %} +
+

Adding a New Thing

+ +
+ +

The Basics

+ +
+ + +
+ +
+ + +
+ + + + + +
+ +
+ + +
+
+ + +
+ + +
+ + +
+ + +
+ + +

Primary Release / Edition

+ + + + + + + + + + + + + +

Anything Else?

+ + + + +
Create Work
+
+ +
+{% endblock %} + +{% block postscript %} + +{% endblock %} diff --git a/python/fatcat_web/templates/release_search.html b/python/fatcat_web/templates/release_search.html new file mode 100644 index 00000000..c57ad149 --- /dev/null +++ b/python/fatcat_web/templates/release_search.html @@ -0,0 +1,64 @@ +{% extends "base.html" %} +{% block body %} + +

Article Search

+
+
+
+ + +
+
+ + +
+
+
+ +
+ +{% if found %} +{% if found.results %} + Showing top {{ found.count_returned }} out of {{ found.count_found }} results for: {{ found.query.q }} +{% for paper in found.results %} +
+

{{ paper['title'] }} + {% if paper.file_pdf_url %} +   fulltext + {% endif %} +

+
{{ ", ".join(paper.contrib_names) }}
+ {% if paper.doi %} + DOI: {{ paper.doi }} + - {{ paper.release_type }} + {% if paper.release_date %} + - {{ paper.release_date[:4] }} + {% endif %} +
+ {% endif %} + {% if paper.container_name %} + {% if paper.container_issnl %} + {{ paper.container_name }} + {% else %} + {{ paper.container_name }} + {% endif %} + {% if paper.container_is_oa %}{% endif %} + {% endif %} +
+{% endfor %} +{% else %} +
+

No results found!

+ Query was: {{ found.query.q }} +
+

Try:

+ +
+{% endif %} +{% endif %} + +{% endblock %} diff --git a/python/fatcat_web/templates/release_view.html b/python/fatcat_web/templates/release_view.html new file mode 100644 index 00000000..39dcf8fd --- /dev/null +++ b/python/fatcat_web/templates/release_view.html @@ -0,0 +1,290 @@ +{% extends "base.html" %} +{% block fullbody %} + +
+
+
+

{{ release.title }} +
release {{ release.ident }}

+

+ {% if authors != [] %} by {% endif %} + {% for contrib in authors %} + {% if contrib.creator_id %} + {{ contrib.raw_name }}{% if not loop.last %}, {% endif %} + {% else %} + {% if contrib.raw_name != None %}{{ contrib.raw_name }}{% else %}Unknown{% endif %}{% if not loop.last %}, {% endif %} + {% endif %} + {% endfor %} +

+
+ +
+
+
+ +{% if release.release_date != None %}

Date (published): {{ release.release_date }}{% endif %} +{% if release.pmid != None %} +
PubMed:  {{ release.pmid }} +{% endif %} +{% if release.pmcid != None %} +
PubMed Central:  {{ release.pmcid }} +{% endif %} +{% if release.wikidata_qid != None %} +
Wikidata Entity:  {{ release.wikidata_qid }} +{% endif %} +{% if release.language != None %} +
Primary Language:  {{ release.language }} (lookup ISO-639 code) +{% endif %} +
This {{ release.release_type or "unknown-type" }} is a release +(version) of the work  {{ +release.work_id }}. There may be other releases (eg, pre-prints, +formal publications, etc) linked to the same work. + +{% if container != None %} +

+
+ + {% if release.release_status == 'published' %} + Published in {{ container.name }} + {% else %} + Released in {{ release.release_type }} in {{ container.name }} + {% endif %} + {% if release.publisher %} + by {{ release.publisher }} + {% endif %} +
+ {% if container != None and container.issnl != None %}ISSN-L: {{ container.issnl }}
{% endif %} + {% if release.volume != None %}Volume: {{ release.volume }}
{% endif %} + {% if release.issue != None %}Issue: {{ release.issue }}
{% endif %} + {% if release.pages != None %}Page(s): {{ release.pages }}
{% endif %} + {% if release.publisher != None %}Publisher: {{ release.publisher }}
{% endif %} + {% if release.release_status != None %}Release Status: {{ release.release_status }}
{% endif %} + {% if release.release_type != None %}Release Type: {{ release.release_type}}
{% endif %} +
+
+{% endif %} + +{% if release.extra != None %} +

Extra Metadata (raw JSON)

+{% for (key, value) in release.extra.items() %} +{{ key }}: {% if key == "crossref" %} <truncated, see full JSON>{% else %} {{ value }} {% endif %}
+{% endfor %} +{% endif %} + + + +
+

Abstracts

+{% if release.abstracts != [] %} + {% for abstract in release.abstracts %} + Abstract ({{ abstract.sha1 }}, {{ abstract.mimetype }}): {{ abstract.content }} + {% endfor %} +{% else %} +

No known abstracts. +{% endif %} + +
+

All Contributors

+{% if release.contribs.size != 0 %} + + + + + {% for contrib in release.contribs %} + +
Attribution Order + Name + Role +
{% if contrib.index or contrib.index == 0 %} {{ contrib.index + 1 }}{% endif %} + {% if contrib.creator_id %} + {{ contrib.raw_name }} + {% else %} + {{ contrib.raw_name }} + {% endif %} + {{ contrib.role or '' }} + {% endfor %} +
+{% else %} +

Contributors (authors, translators, etc) not known. +{% endif %} + +
+

Known Files and URLs

+{% if files != [] %} + + + + + {% for file in files %} + +
SHA-1 + Size (bytes) + File Type + Links +
{{ file.sha1[:16] + "..." }} + {% if file.size != None %}{{ file.size }}{% endif %} + {% if file.mimetype != None %}{{ file.mimetype }}{% endif %} + {% for url in file.urls %} + {{ url.url.split('/')[2] }} ({{ url.rel }})
+ {% endfor %} + {% endfor %} +
+ +{% else %} +

There are no known files associated with this release (you could try +other releases for this work?). +{% endif %} + +
+{% if release.refs.size != 0 %} +

References

+This release citing other releases. +
    + {% for ref in release.refs %} +
  1. + {% if ref.title %} + {{ ref.title }} + {% if ref.container_name %}{{ ref.container_name }}.{% endif %} + {% if ref.year %}{{ ref.year }}{% endif %} + {% if ref.locator %}{{ ref.locator }}{% endif %} + {% elif ref.extra and ref.extra.crossref %} + {% if ref.extra.crossref.get('author') %}{{ ref.extra.crossref['author'] }}.{% endif %} + {% if ref.extra.crossref.get('article-title') %}{{ ref.extra.crossref['article-title'] }}.{% endif %} + {% if ref.container_name %}{{ ref.container_name }}.{% endif %} + {% if ref.year %}{{ ref.year }}.{% endif %} + {% elif ref.extra and ref.extra.unstructured %} + {{ ref.extra.unstructured }} + {% else %} + unknown + {% endif %} + {% if ref.target_release_id != None %} + (fatcat release) +{# {% elif ref.extra != None and ref.extra.doi != None %} + (DOI: {{ ref.extra.get('doi') }}) #} + {% endif %} + {% endfor %} +
+{% else %} +

No reference list available. +{% endif %} + +

+
+ +{% if files != [] and files[0].urls != [] %} +Download Full Text +{% else %} +No Full Text Available +{% endif %} + +{% if release.release_type != None %} +
+Release Type {{ release.release_type }} +
+{% endif %} + +{% if release.doi %} + +{% endif %} +{% if release.isbn13 != None %} +
+

ISBN-13  {{ release.isbn13 }} +

+{% endif %} + +{% if release.extra.is_oa == True %} +
+ Open Access +
+{% elif release.extra.is_oa == False %} + Not Open Access +
+{% endif %} + +{% if container != None %} +
+Container Metadata
+{% if container.extra.is_oa == True %} +Open Access Publication
+{% elif container.extra.is_oa == False %} +Not Open Access
+{% else %} +Unknown OA Status
+{% endif %} +{% if (container.extra != None) %} + {% if container.extra.in_doaj == True %} + In DOAJ
+ {% elif container.extra.in_doaj == False %} + Not in DOAJ
+ {% endif %} + {% if container.extra.in_road == True %} + In ISSN ROAD
+ {% elif container.extra.in_road == False %} + Not in ISSN ROAD
+ {% endif %} + {% if container.extra.is_kept == True %} + In Keepers Registry
+ {% elif container.extra.is_kept == False %} + Not in Keepers Registry
+ {% endif %} +{% endif %} +{% if container.issnl != None %} + ISSN-L:  {{ container.issnl }}
+{% endif %} + Fatcat:  {{ container.ident }}
+
+{% endif %} + +
+
Lookup Links
+
+{% if container != None and container.issnl != None %} + SHERPA/RoMEO (journal policies)
+{% endif %} +{% if container != None and container.doi != None %} + oaDOI/unpaywall
+{% endif %} +{% if release.isbn13 != None %} + Open Library
+ Worldcat
+{% else %} + Worldcat
+{% endif %} +{% if release.doi %} +Crossref Metadata (via API)
+{% endif %} +wikidata.org
+CORE.ac.uk
+Semantic Scholar (CS, neuro)
+Google Scholar
+
+
+ +
+Fatcat Bits +

State is "{{ release.state }}". Revision: +
{{ release.revision }} +
As JSON object via API + +

+ + +
+
+{% endblock %} + +{% block postscript %} + +{% endblock %} diff --git a/python/fatcat_web/templates/stats.html b/python/fatcat_web/templates/stats.html new file mode 100644 index 00000000..6a37dcee --- /dev/null +++ b/python/fatcat_web/templates/stats.html @@ -0,0 +1,104 @@ +{% extends "base.html" %} +{% block body %} + +

Entity Statistics

+ +
+
+ {{ stats.entity_counts.work }} +
+
+ Works +
+
+ +
+ +
+
+ {{ stats.entity_counts.release }} +
+
+ Releases +
+
+ +
+
+ {{ stats.releases_with_dois }} +
+
+ ... with DOIs +
+
+ +
+
+ {{ stats.releases_with_dois }} +
+
+ ... with a File +
+
+ +
+ +
+
+ {{ stats.entity_counts.container }} +
+
+ Containers +
+
+ +
+
+ {{ stats.containers_with_issnls }} +
+
+ ... with an ISSN-L +
+
+ +
+ +
+
+ {{ stats.entity_counts.creator }} +
+
+ Creators +
+
+ +
+
+ {{ stats.creators_with_orcids }} +
+
+ ... with an ORCID +
+
+ +
+ +
+
+ {{ stats.entity_counts.file }} +
+
+ Files +
+
+ +
+
+ {{ stats.files_with_releases }} +
+
+ ... with a Release +
+
+ +{% endblock %} diff --git a/python/fatcat_web/templates/work_view.html b/python/fatcat_web/templates/work_view.html new file mode 100644 index 00000000..87120e63 --- /dev/null +++ b/python/fatcat_web/templates/work_view.html @@ -0,0 +1,72 @@ +{% extends "base.html" %} +{% block fullbody %} + +
+
+
+

+
work {{ work.ident }}

+
+
+ +
+
+
+ +{% if work.extra != None %} +

Extra Metadata (raw JSON)

+{% for (key, value) in work.extra.items() %} +{{ key }}: {{ value }}
+{% endfor %} +{% endif %} + + + +

A "work" is just a linking identifier between a set of releases. For +example, a pre-print and a published article may contain small differences, but +still reference the same underlying "work". + +
+ +{% if releases != [] %} +

    + {% for release in releases %} +
  • "{{ release.title }}", a {{ release.release_type }} published {{ release.release_date }} as {{ release.release_status }}. +
    {{ release.ident }} + {% endfor %} +
+{% else %} +

There are no known releases associated with this work. +{% endif %} + + +

+
+
+ +Work Type: +{% if work.work_type != None %} + {{ work.work_type }} +{% else %} +unknown +{% endif %} +
+ +Fatcat Bits +

State is "{{ work.state }}". Revision: +
{{ work.revision }} +
As JSON object via API + +

+ + +
+
+ +{% endblock %} diff --git a/python/webface_config.py b/python/webface_config.py new file mode 100644 index 00000000..3d6db049 --- /dev/null +++ b/python/webface_config.py @@ -0,0 +1,20 @@ + +import os +import subprocess + +basedir = os.path.abspath(os.path.dirname(__file__)) + +class Config(object): + SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URI') or \ + 'sqlite:///' + os.path.join(basedir, 'fatcat_dev.sqlite') + SQLALCHEMY_TRACK_MODIFICATIONS = False + GIT_REVISION = subprocess.check_output(["git", "describe", "--always"]).strip() + # This is, effectively, the QA/PROD flag + FATCAT_DOMAIN = "qa.fatcat.wiki" + ELASTIC_BACKEND = "https://search.fatcat.wiki" + ELASTIC_INDEX = "fatcat" + + # "Even more verbose" debug options. SECRET_KEY is bogus. + #SQLALCHEMY_ECHO = True + #SECRET_KEY = "kuhy0284hflskjhg01284" + #DEBUG = True -- cgit v1.2.3