aboutsummaryrefslogtreecommitdiffstats
diff options
context:
space:
mode:
authorMartin Czygan <martin.czygan@gmail.com>2021-04-25 17:56:03 +0200
committerMartin Czygan <martin.czygan@gmail.com>2021-04-25 17:56:03 +0200
commitfaea941faff358802a8950af54d05ac88264b8f3 (patch)
tree6521b7adfd6fb271716f58d87d8ad06d8c6c0b31
parent1cf6d135f274ce79b09a0396367186132bc178f3 (diff)
downloadrefcat-faea941faff358802a8950af54d05ac88264b8f3.tar.gz
refcat-faea941faff358802a8950af54d05ac88264b8f3.zip
wip: partial document stub
-rw-r--r--skate/map.go36
1 files changed, 30 insertions, 6 deletions
diff --git a/skate/map.go b/skate/map.go
index 571a297..9cad1e4 100644
--- a/skate/map.go
+++ b/skate/map.go
@@ -19,14 +19,24 @@ type TitleDoc struct {
Title string `json:"title"`
}
-// Mapper converts a blob.
-type Mapper func([]byte) ([]byte, error)
+// PartialDoc is for docs that do not have a DOI or title. E.g. we found 49701699
+// (NCVY), 36401044 (NCVYU), 29668363 (NCUY), and so on.
+type PartialDoc struct {
+ ContainerName string `json:"container_name"`
+ Contribs []struct {
+ RawName string `json:"raw_name"`
+ } `json:"contribs"`
+ Volume string `json:"volume"`
+ Unstructured string `json:"unstructured"`
+ Year string `json:"release_year"`
+}
// FieldMapper maps a blob to an arbitrary number of fields, e.g. for (key,
-// doc) etc.
+// doc). We want fields, but we do not want to bake in TSV into each function.
type FieldMapper func([]byte) ([][]byte, error)
-// TSV serialized the result of a field mapper as TSV.
+// TSV serializes the result of a field mapper as TSV. This is a slim adapter,
+// e.g. to parallel.Processor, which expects this function signature.
func (f FieldMapper) TSV(p []byte) ([]byte, error) {
fields, err := f(p)
if err != nil {
@@ -46,11 +56,14 @@ func NameOf(f interface{}) string {
return v.String()
}
+// Identity returns just the input again.
func Identity(p []byte) ([][]byte, error) {
return [][]byte{p}, nil
}
-func CreateFixedMapper(path string) FieldMapper {
+// CreateFixedMapper extracts the value from a given fixed top level json key.
+// Returns a function that maps doc to (v, doc).
+func CreateFixedMapper(field string) FieldMapper {
f := func(p []byte) ([][]byte, error) {
var (
doc map[string]interface{}
@@ -61,7 +74,7 @@ func CreateFixedMapper(path string) FieldMapper {
if err := json.Unmarshal(p, &doc); err != nil {
return nil, err
}
- if v, ok = doc[path]; !ok {
+ if v, ok = doc[field]; !ok {
return nil, nil
}
switch w := v.(type) {
@@ -81,6 +94,7 @@ func CreateFixedMapper(path string) FieldMapper {
return f
}
+// MapperTitle extracts (title, doc).
func MapperTitle(p []byte) ([][]byte, error) {
var (
doc TitleDoc
@@ -94,6 +108,7 @@ func MapperTitle(p []byte) ([][]byte, error) {
return [][]byte{key, p}, nil
}
+// MapperTitleNormalized extracts (title normalized, doc).
func MapperTitleNormalized(p []byte) (fields [][]byte, err error) {
if fields, err = MapperTitle(p); err != nil {
return nil, err
@@ -107,6 +122,7 @@ func MapperTitleNormalized(p []byte) (fields [][]byte, err error) {
return fields, nil
}
+// MapperTitleNysiis extracts (title nysiis, doc).
func MapperTitleNysiis(p []byte) (fields [][]byte, err error) {
if fields, err = MapperTitle(p); err != nil {
return nil, err
@@ -118,6 +134,7 @@ func MapperTitleNysiis(p []byte) (fields [][]byte, err error) {
return fields, nil
}
+// MapperTitleSandcrawler extracts (title sandcrawler, doc).
func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) {
if fields, err = MapperTitle(p); err != nil {
return nil, err
@@ -127,3 +144,10 @@ func MapperTitleSandcrawler(p []byte) (fields [][]byte, err error) {
fields[0] = []byte(key)
return fields, nil
}
+
+// MapperPartial works on partial documents.
+func MapperPartial(p []byte) (fields [][]byte, err error) {
+ // XXX: slugify authors, how to compare two author strings? What do these
+ // things look like?
+ return nil, nil
+}