diff options
author | Bryan Newbold <bnewbold@archive.org> | 2018-08-23 19:35:23 -0700 |
---|---|---|
committer | Bryan Newbold <bnewbold@archive.org> | 2018-08-23 19:36:23 -0700 |
commit | 98e67e291132b10a0ca698ad4ff754acc0c22121 (patch) | |
tree | 940d436787c14548eacf6b86d3670a9f3ef3b1ad /scalding/src/main | |
parent | 715a35715609d8cbacff53dd5c7c1715c53a55f8 (diff) | |
download | sandcrawler-98e67e291132b10a0ca698ad4ff754acc0c22121.tar.gz sandcrawler-98e67e291132b10a0ca698ad4ff754acc0c22121.zip |
add a content-type filter for crossref works
Diffstat (limited to 'scalding/src/main')
-rw-r--r-- | scalding/src/main/scala/sandcrawler/CrossrefScorable.scala | 18 |
1 file changed, 17 insertions, 1 deletion
```diff
diff --git a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
index baa1ca9..039fa85 100644
--- a/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
+++ b/scalding/src/main/scala/sandcrawler/CrossrefScorable.scala
@@ -26,6 +26,21 @@ class CrossrefScorable extends Scorable with HBasePipeConversions {
 }
 
 object CrossrefScorable {
+
+  val ContentTypeWhitelist: Set[String] = Set(
+    "book",
+    "book-chapter",
+    "dataset",
+    "dissertation",
+    "journal-article",
+    "letter",
+    "monograph",
+    "posted-content",
+    "pre-print",
+    "proceedings-article",
+    "report",
+    "working-paper")
+
   def keepRecord(json : String) : Boolean = {
     Scorable.jsonToMap(json) match {
       case None => false
@@ -90,7 +105,8 @@ object CrossrefScorable {
             val doi = Scorable.getString(map, "DOI")
             val authors: List[String] = mapToAuthorList(map)
             val year: Int = mapToYear(map).getOrElse(0)
-            if (doi.isEmpty || doi == null || authors.length == 0) {
+            val contentType: String = map.get("type").map(e => e.asInstanceOf[String]).getOrElse("MISSING-CONTENT-TYPE")
+            if (doi.isEmpty || doi == null || authors.length == 0 || !(ContentTypeWhitelist contains contentType)) {
               MapFeatures(Scorable.NoSlug, json)
             } else {
               val sf : ScorableFeatures = ScorableFeatures.create(title=title, authors=authors, doi=doi.toLowerCase(), year=year)
```