From c133f3077aa975aa4706a8e5ca894fc1b71fbc67 Mon Sep 17 00:00:00 2001 From: Bryan Newbold Date: Mon, 10 Aug 2020 17:27:26 -0700 Subject: datacite import: store less subject metadata Many of these 'subject' objects have the equivalent of several lines of text, with complex URLs that don't compress well. I think it is fine we have included these thus far instead of parsing more deeply, but going forward I don't think this nested 'extra' metadata is worth the database space. --- python/fatcat_tools/importers/datacite.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/python/fatcat_tools/importers/datacite.py b/python/fatcat_tools/importers/datacite.py index d4d7a9f5..fe02cac4 100644 --- a/python/fatcat_tools/importers/datacite.py +++ b/python/fatcat_tools/importers/datacite.py @@ -597,7 +597,13 @@ class DataciteImporter(EntityImporter): if license_extra: extra_datacite["license"] = license_extra if attributes.get("subjects"): - extra_datacite["subjects"] = attributes["subjects"] + # these subjects with schemeUri are too much metadata, which + # doesn't compress. filter them out. + extra_subjects = [ + subj for subj in attributes["subjects"] if not subj.get("schemeUri") + ] + if extra_subjects: + extra_datacite["subjects"] = extra_subjects # Include version information. metadata_version = attributes.get("metadataVersion") or "" -- cgit v1.2.3