package skate import ( "bytes" "crypto/sha1" "encoding/gob" "fmt" "regexp" "strconv" "strings" "sync" "time" "git.archive.org/martin/cgraph/skate/isbn" "git.archive.org/martin/cgraph/skate/set" ) var ( // Various ISBN patterns tailored towards more dirty values, e.g. with // artifacts from OCR, etc. isbn10Regex = regexp.MustCompile(`[O0-9xX -]{10,18}`) isbn13Regex = regexp.MustCompile(`9[O0-9xX -]{12,20}`) // openLibraryDateLayouts, e.g. as found in Open Library Editions, // .publish_date. openLibraryDateLayouts = []string{ "1 Jan 2006", "2006", "2006-02", "2006-02-01", "Jan 2, 2006", "Jan 2006", } rune16pool = sync.Pool{ New: func() interface{} { return make([]rune, 0, 16) }, } setPool = sync.Pool{ New: func() interface{} { return set.New() }, } ) // Ref is a reference document (derived from "heavy intermediate schema"), can // be very partial. type Ref struct { Biblio Biblio `json:"biblio"` Index int64 `json:"index,omitempty"` Key string `json:"key,omitempty"` Locator string `json:"locator:omitempty"` RefSource string `json:"ref_source,omitempty"` ReleaseYear int `json:"release_year,omitempty"` ReleaseIdent string `json:"release_ident,omitempty"` ReleaseStage string `json:"release_stage,omitempty"` WorkIdent string `json:"work_ident,omitempty"` } // Biblio contains bibliographic information for a Ref. type Biblio struct { ArxivId string `json:"arxiv_id,omitempty"` ContainerName string `json:"container_name,omitempty"` ContribRawNames []string `json:"contrib_raw_names,omitempty"` DOI string `json:"doi,omitempty"` Issue string `json:"issue,omitempty"` PMCID string `json:"pmcid,omitempty"` PMID string `json:"pmid,omitempty"` Pages string `json:"pages,omitempty"` Publisher string `json:"publisher,omitempty"` Title string `json:"title,omitempty"` Unstructured string `json:"unstructured,omitempty"` Url string `json:"url,omitempty"` Volume string `json:"volume,omitempty"` Year int64 `json:"year,omitempty"` // More non-standard fields go into extra. 
Extra struct { ISBN []string `json:"isbn"` } `json:"extra"` } // RefToRelease converts a ref to a release. func RefToRelease(ref *Ref) (*Release, error) { var ( release Release b = ref.Biblio contribs = make([]struct { Index int `json:"index,omitempty"` RawName string `json:"raw_name,omitempty"` Role string `json:"role,omitempty"` }, len(b.ContribRawNames)) ) release.Ident = ref.ReleaseIdent release.WorkID = ref.WorkIdent release.ExtIDs.Arxiv = b.ArxivId release.ExtIDs.DOI = b.DOI release.ExtIDs.PMID = b.PMID release.ExtIDs.PMCID = b.PMCID release.Title = b.Title release.Publisher = b.Publisher release.ContainerName = b.ContainerName release.Volume = b.Volume release.Issue = b.Issue release.Pages = b.Pages if ref.ReleaseYear > 1000 { release.ReleaseYearValue = fmt.Sprintf("%d", ref.ReleaseYear) } for i, name := range b.ContribRawNames { contribs[i].Index = i contribs[i].RawName = name } release.Contribs = contribs if strings.Contains(strings.ToLower(ref.Biblio.Unstructured), "isbn") { release.ExtIDs.ISBN = ParseIsbn(ref.Biblio.Unstructured) } // Extra info stashed into extra.skate.ref. release.Extra.Skate.Ref.Index = ref.Index release.Extra.Skate.Ref.Key = ref.Key release.Extra.Skate.Ref.Locator = ref.Locator release.Extra.Skate.Ref.Source = ref.RefSource return &release, nil } // ReleaseToUnstructured tries to render a sensible string, e.g. for frontend // display of unmatched and other relations. Some examples: // https://guides.lib.uw.edu/c.php?g=341448&p=4076094 No specific style, just // try to be readable. func ReleaseToUnstructured(r *Release) string { var ( buf bytes.Buffer names = make([]string, len(r.Contribs)) ) for i := 0; i < len(r.Contribs); i++ { names[i] = r.Contribs[i].RawName } fmt.Fprintf(&buf, "%s", strings.Join(names, ", ")) if r.Title != "" { if buf.Len() > 0 { fmt.Fprintf(&buf, ". 
") } fmt.Fprintf(&buf, `%s`, r.Title) } if len(r.Subtitle()) > 0 { fmt.Fprintf(&buf, ": %s", strings.Join(r.Subtitle(), " ")) } if r.ContainerName != "" { if buf.Len() > 0 { fmt.Fprintf(&buf, ". ") } fmt.Fprintf(&buf, `%s`, r.ContainerName) } if r.Volume != "" { if buf.Len() > 0 { fmt.Fprintf(&buf, ", ") } fmt.Fprintf(&buf, `vol. %s`, r.Volume) } if r.Issue != "" { if buf.Len() > 0 { fmt.Fprintf(&buf, ", ") } fmt.Fprintf(&buf, `no. %s`, r.Issue) } if r.ReleaseYear() > 0 { if buf.Len() > 0 { fmt.Fprintf(&buf, ", ") } fmt.Fprintf(&buf, `%s`, r.ReleaseYearString()) } if r.Pages != "" { if buf.Len() > 0 { fmt.Fprintf(&buf, ", ") } fmt.Fprintf(&buf, `pp. %s`, r.Pages) } if r.Publisher != "" { if buf.Len() > 0 { fmt.Fprintf(&buf, ", ") } fmt.Fprintf(&buf, `%s`, r.Publisher) } switch { case r.ExtIDs.DOI != "": if buf.Len() > 0 { fmt.Fprintf(&buf, ", ") } fmt.Fprintf(&buf, `%s`, r.ExtIDs.DOI) case len(r.ExtIDs.ISBN) > 0: if buf.Len() > 0 { fmt.Fprintf(&buf, ", ") } if isbn13, err := isbn.To13(r.ExtIDs.ISBN[0]); err == nil { fmt.Fprintf(&buf, `%s`, isbn13) } else { fmt.Fprintf(&buf, `%s`, r.ExtIDs.ISBN[0]) } } return buf.String() } // ParseIsbn tries to find and validate ISBN from unstructured data. Returns a // list of unique, unsorted and validated ISBN13, e.g. 9780123838520. func ParseIsbn(s string) []string { // ISBN: 10: 0137822693, pp: 373 // Robotec, E. (1996). Scorbot ER VII, User's Manual, Eshed Robotec, // ISBN9652910333. Shannon, C. (1948). A Mathematical Theory of // Communication. The Bell System Technical Journal. July; October, // Vol. 27, pp. 379-423; 623-656. // Artech House, ISBN: 978-1-60807-201-9, 2011. // ... var ( candidates10 = isbn10Regex.FindAllString(s, -1) candidates13 = isbn13Regex.FindAllString(s, -1) u []rune z string err error ) valid := setPool.Get().(set.Set) valid.Clear() defer setPool.Put(valid) for _, v := range append(candidates10, candidates13...) 
{ u = rune16pool.Get().([]rune) u = u[:0] for _, c := range v { if c == 'O' { c = '0' } if c >= '0' && c <= '9' || c == 'X' || c == 'x' { u = append(u, c) } } z = string(u) rune16pool.Put(u) if !isbn.Validate(z) { continue } if len(z) < 12 { if z, err = isbn.To13(z); err != nil { continue } } valid.Add(z) } return valid.Slice() } // Release document. Note that we may have different types for some fields // (e.g. string, int, etc.). Mitigation for now is to make the field an // interface{}, name the field "...Value" and to add a method with the field // name, doing type assertion. Example: ReleaseYearValue interface{}, // ReleaseYear() int, etc. // // Extra field gets a section for "skate" for conversion related values. type Release struct { ContainerID string `json:"container_id,omitempty"` ContainerName string `json:"container_name,omitempty"` // when not resolved Container struct { ContainerType string `json:"container_type"` Ident string `json:"ident"` Issnl string `json:"issnl"` Name string `json:"name"` Publisher string `json:"publisher"` Revision string `json:"revision"` State string `json:"state"` WikidataQid string `json:"wikidata_qid"` } `json:"container"` Contribs []struct { Index int `json:"index,omitempty"` RawName string `json:"raw_name,omitempty"` Role string `json:"role,omitempty"` } `json:"contribs,omitempty"` ExtIDs struct { Arxiv string `json:"arxiv,omitempty"` Core string `json:"core,omitempty"` DOI string `json:"doi,omitempty"` ISBN []string `json:"isbn,omitempty"` // should be isbn13 Jstor string `json:"jstor,omitempty"` OLID string `json:"olid,omitempty"` PMCID string `json:"pmcid,omitempty"` PMID string `json:"pmid,omitempty"` WikidataQID string `json:"wikidata_qid,omitempty"` } `json:"ext_ids,omitempty"` Ident string `json:"ident,omitempty"` Publisher string `json:"publisher,omitempty"` Refs []struct { ContainerName string `json:"container_name,omitempty"` Extra struct { DOI string `json:"doi,omitempty"` Authors []string 
`json:"authors,omitempty"` Key string `json:"key,omitempty"` Year string `json:"year,omitempty"` Locator string `json:"locator,omitempty"` Volume string `json:"volume,omitempty"` } `json:"extra"` Index int64 `json:"index,omitempty"` Key string `json:"key,omitempty"` Locator string `json:"locator,omitempty"` Year int64 `json:"year,omitempty"` } `json:"refs,omitempty"` ReleaseDate string `json:"release_date,omitempty"` ReleaseYearValue interface{} `json:"release_year,omitempty"` // might be int or str ReleaseStage string `json:"release_stage,omitempty"` ReleaseType string `json:"release_type,omitempty"` Issue string `json:"issue,omitempty"` Volume string `json:"volume,omitempty"` Pages string `json:"pages,omitempty"` Title string `json:"title,omitempty"` WorkID string `json:"work_id,omitempty"` Extra struct { ContainerName string `json:"container_name,omitempty"` SubtitleValue interface{} `json:"subtitle,omitempty"` // []str or str Crossref struct { Type string `json:"type,omitempty"` } `json:"crossref,omitempty"` DataCite struct { MetadataVersion int `json:"metadataVersion,omitempty"` Relations []DataCiteRelation `json:"relations,omitempty"` } `json:"datacite,omitempty"` Skate struct { // Mark as converted from "ref", "rg" or other schemas. Status string `json:"status,omitempty"` // Carry the ref index and key around. Ref struct { Index int64 `json:"index,omitempty"` Key string `json:"key,omitempty"` Locator string `json:"locator,omitempty"` Source string `json:"source,omitempty"` } `json:"ref,omitempty"` ResearchGate struct { URL string `json:"url,omitempty"` } `json:"rg,omitempty"` ResolvedContainerName string `json:"resolved_container_name"` } `json:"skate,omitempty"` OpenLibrary struct { HasFulltext bool `json:"has_fulltext,omitempty"` WorkID string `json:"work,omitempty"` SourceRecords []string `json:"source_records,omitempty"` } `json:"ol,omitempty"` } `json:"extra,omitempty"` } // Subtitle returns a slice of subtitle strings. 
func (r *Release) Subtitle() (result []string) { switch v := r.Extra.SubtitleValue.(type) { case []interface{}: for _, e := range v { result = append(result, fmt.Sprintf("%v", e)) } return result case []string: return v case string: return []string{v} } return []string{} } // ReleaseYearString returns release year as string. func (r *Release) ReleaseYearString() string { return fmt.Sprintf("%d", r.ReleaseYear()) } // ReleaseYear returns year as int, no further validity checks. func (r *Release) ReleaseYear() int { switch v := r.ReleaseYearValue.(type) { case int: return v case float64: return int(v) case string: w, err := strconv.Atoi(v) if err != nil { return 0 } return w default: return 0 } } // DataCiteRelation as it appears in the release extra field. type DataCiteRelation struct { RelatedIdentifierType string `json:"relatedIdentifierType,omitempty"` RelatedIdentifierValue interface{} `json:"relatedIdentifier,omitempty"` } // RelatedIdentifier returns the identifier as string. func (r *DataCiteRelation) RelatedIdentifier() string { switch v := r.RelatedIdentifierValue.(type) { case string: return v default: return fmt.Sprintf("%v", v) } } // SitemapEntry in a basic JSON style, e.g. for https://archive.org/details/rg_sitemap_2021_02_23. type SitemapEntry struct { Lastmod string `json:"lastmod,omitempty"` Title string `json:"title,omitempty"` URL string `json:"url,omitempty"` } // BiblioRef as a prototype for citation graph elasticsearch indexing, // https://is.gd/yicTom. 
type BiblioRef struct {
	// Document identity and indexing timestamp.
	Key       string `json:"_id,omitempty"`
	IndexedTs string `json:"indexed_ts,omitempty"` // https://www.elastic.co/guide/en/elasticsearch/reference/current/date.html
	// Source side of the link.
	SourceReleaseIdent     string `json:"source_release_ident,omitempty"`
	SourceWorkIdent        string `json:"source_work_ident,omitempty"`
	SourceWikipediaArticle string `json:"source_wikipedia_article,omitempty"`
	SourceReleaseStage     string `json:"source_release_stage,omitempty"`
	SourceYear             string `json:"source_year,omitempty"` // TODO: should this be source_release_year
	// Position of the reference within the source.
	RefIndex   int64  `json:"ref_index,omitempty"` // 1-based
	RefKey     string `json:"ref_key,omitempty"`
	RefLocator string `json:"ref_locator,omitempty"`
	// Target side of the link.
	TargetReleaseIdent    string `json:"target_release_ident,omitempty"`
	TargetWorkIdent       string `json:"target_work_ident,omitempty"`
	TargetOpenLibraryWork string `json:"target_openlibrary_work,omitempty"`
	TargetURLSurt         string `json:"target_url_surt,omitempty"`
	TargetURL             string `json:"target_url,omitempty"`
	// Match metadata.
	MatchProvenance string `json:"match_provenance,omitempty"`
	MatchStatus     string `json:"match_status,omitempty"`
	MatchReason     string `json:"match_reason,omitempty"`
	// Renderings of the target.
	TargetUnstructured string `json:"target_unstructured,omitempty"`
	TargetCSL          string `json:"target_csl,omitempty"`
}

// Reset clears every field, so the value can be reused for another document.
func (b *BiblioRef) Reset() {
	*b = BiblioRef{}
}

// LinkHash returns a string that will be the same, if source and target are
// equal; different otherwise. This can be used to detect duplicate links.
func (b *BiblioRef) LinkHash() string { switch { case b.SourceReleaseIdent != "" && b.TargetReleaseIdent != "": return fmt.Sprintf("fc:%s--fc:%s", b.SourceReleaseIdent, b.TargetReleaseIdent) case b.SourceReleaseIdent != "" && b.TargetOpenLibraryWork != "": return fmt.Sprintf("fc:%s--ol:%s", b.SourceReleaseIdent, b.TargetOpenLibraryWork) case b.SourceReleaseIdent != "" && b.TargetURL != "": return fmt.Sprintf("fc:%s--wb:%s", b.SourceReleaseIdent, b.TargetURL) case b.SourceReleaseIdent != "" && b.TargetURLSurt != "": return fmt.Sprintf("fc:%s--wb:%s", b.SourceReleaseIdent, b.TargetURLSurt) default: var ( buf bytes.Buffer enc = gob.NewEncoder(&buf) h = sha1.New() ) if err := enc.Encode(b); err != nil { return "" } if _, err := buf.WriteTo(h); err != nil { return "" } return fmt.Sprintf("%x", h.Sum(nil)) } } // ReleaseCluster, a list of match candidates. This is typically serialized as a // single JSON line containing the match key and a list of release documents. // // Deprecated, since we are moving to a "two stream" generic "join" style // processing. type ReleaseCluster struct { Key string `json:"k"` Values []*Release `json:"v"` } // OneNonRef returns the first non-reference release found in a cluster, or an // error, if none has been found. This depends on converted references setting // extra.skate.status to "ref" - we use this in mixed clusters (catalog entries // and references converted into releases). func (rc *ReleaseCluster) OneNonRef() (*Release, error) { for _, re := range rc.Values { if re.Extra.Skate.Status != "ref" { return re, nil } } return nil, fmt.Errorf("no reference/release found for cluster key: %v", rc.Key) } // MinimalCitations variant from archive.org/details/wikipedia_citations_2020-07-14. type MinimalCitations struct { IDList string `json:"ID_list"` PageTitle string `json:"page_title"` Title string `json:"Title"` TypeOfCitation string `json:"type_of_citation"` } // IDList with commonly used identifier from wikipedia citations. 
type IDList struct { ISBN string `json:"isbn,omitempty"` DOI string `json:"doi,omitempty"` PMID string `json:"pmid,omitempty"` ISSN string `json:"issn,omitempty"` JSTOR string `json:"jstor,omitempty"` PMC string `json:"pmc,omitempty"` ARXIV string `json:"arxiv,omitempty"` OL string `json:"ol,omitempty"` } // IsZero returns true, if none of the identifiers is defined. func (l *IDList) IsZero() bool { return *l == IDList{} } // ParseIDList parses out the identifiers from a citation document, the IDList // values look something like this: "{BIBCODE=1992ApJ...399L..31C, // DOI=10.1086/186599}". func (c *MinimalCitations) ParseIDList() (result IDList) { if len(c.IDList) < 3 { return result } var ( s = c.IDList[1 : len(c.IDList)-1] // get rid of "{" and "}" parts = strings.Split(s, ",") pair []string ) for _, part := range parts { pair = strings.Split(part, "=") if len(pair) != 2 { continue } pair[0] = strings.TrimSpace(pair[0]) pair[1] = strings.TrimSpace(pair[1]) switch pair[0] { case "ISBN": result.ISBN = pair[1] case "DOI": result.DOI = pair[1] case "PMID": result.PMID = pair[1] case "ISSN": result.ISSN = pair[1] case "PMC": result.PMC = pair[1] case "JSTOR": result.JSTOR = pair[1] case "ARXIV": result.ARXIV = pair[1] case "OL": result.OL = pair[1] default: continue } } return result } // OpenLibrarySolrDoc from data dump (solr). Note: we most likely only need OL // editions. 
type OpenLibrarySolrDoc struct { AuthorFacet []string `json:"author_facet"` AuthorKey []string `json:"author_key"` AuthorName []string `json:"author_name"` CoverEditionKey string `json:"cover_edition_key"` CoverI int64 `json:"cover_i"` EbookCountI int64 `json:"ebook_count_i"` EditionCount int64 `json:"edition_count"` EditionKey []string `json:"edition_key"` FirstPublishYear int64 `json:"first_publish_year"` HasFulltext bool `json:"has_fulltext"` IdGoodreads []string `json:"id_goodreads"` IdLibrarything []string `json:"id_librarything"` Isbn []string `json:"isbn"` Key string `json:"key"` Language []string `json:"language"` LastModifiedI int64 `json:"last_modified_i"` PublishDate []string `json:"publish_date"` PublishYear []int64 `json:"publish_year"` Publisher []string `json:"publisher"` PublisherFacet []string `json:"publisher_facet"` Seed []string `json:"seed"` Text []string `json:"text"` Title string `json:"title"` TitleSuggest []string `json:"title_suggest"` Type string `json:"type"` Version int64 `json:"_version_"` } // OpenLibrarySolrDocToRelease convert OL data into a release. func OpenLibrarySolrDocToRelease(w *OpenLibrarySolrDoc) (*Release, error) { var ( release Release contribs = make([]struct { Index int `json:"index,omitempty"` RawName string `json:"raw_name,omitempty"` Role string `json:"role,omitempty"` }, len(w.AuthorName)) s = set.New() ) for i, author := range w.AuthorName { contribs[i].RawName = author } release.Title = w.Title if len(w.PublishYear) > 0 { release.ReleaseYearValue = w.FirstPublishYear } for _, v := range w.Isbn { switch { case len(v) < 13: if w, err := isbn.To13(v); err == nil { s.Add(w) } default: s.Add(v) } } if len(w.Publisher) > 0 { release.Publisher = w.Publisher[0] } release.ExtIDs.ISBN = s.Slice() release.ExtIDs.OLID = strings.Replace(w.Key, "/works/", "", 1) release.Extra.OpenLibrary.HasFulltext = w.HasFulltext return &release, nil } // OpenLibraryEdition document, see: https://openlibrary.org/developers/dumps. 
type OpenLibraryEdition struct {
	// Authors are given by key only, e.g. "/authors/OL1076839A"; names are
	// resolved separately via an author map in OpenLibraryEditionToRelease.
	Authors []struct {
		Key string `json:"key"`
	} `json:"authors"`
	ByStatement string  `json:"by_statement"`
	Covers      []int64 `json:"covers"`
	Created     struct {
		Type  string `json:"type"`
		Value string `json:"value"`
	} `json:"created"`
	DeweyDecimalClass []string `json:"dewey_decimal_class"`
	Genres            []string `json:"genres"`
	Identifiers       struct {
		Goodreads    []string `json:"goodreads"`
		Librarything []string `json:"librarything"`
	} `json:"identifiers"`
	// Isbn10 and Isbn13 are kept as found in the record; Isbns merges them
	// without normalization.
	Isbn10    []string `json:"isbn_10"`
	Isbn13    []string `json:"isbn_13"`
	Key       string   `json:"key"`
	Languages []struct {
		Key string `json:"key"`
	} `json:"languages"`
	LastModified struct {
		Type  string `json:"type"`
		Value string `json:"value"`
	} `json:"last_modified"`
	LatestRevision    int64    `json:"latest_revision"`
	LcClassifications []string `json:"lc_classifications"`
	Lccn              []string `json:"lccn"`
	// Notes comes in different shapes, hence the interface{}. Values seen:
	//
	// null
	// {
	//   "type": "/type/text",
	//   "value": "Includes bibliographical references (p. 137-143)."
	// }
	// null
	// "Includes bibliographical references (p. 203-205) and index."
	// null
	Notes          interface{} `json:"notes"`
	NumberOfPages  int64       `json:"number_of_pages"`
	Ocaid          string      `json:"ocaid"`
	Pagination     string      `json:"pagination"`
	PublishCountry string      `json:"publish_country"`
	// PublishDate is a date string in one of several formats; the release
	// conversion tries the layouts in openLibraryDateLayouts to get a year.
	PublishDate   string   `json:"publish_date"`
	PublishPlaces []string `json:"publish_places"`
	Publishers    []string `json:"publishers"`
	Revision      int64    `json:"revision"`
	SourceRecords []string `json:"source_records"`
	SubjectPlace  []string `json:"subject_place"`
	Subjects      []string `json:"subjects"`
	Subtitle      string   `json:"subtitle"`
	Title         string   `json:"title"`
	Type          struct {
		Key string `json:"key"`
	} `json:"type"`
	// Works lists the keys of the works this edition belongs to; the release
	// conversion uses the first entry only.
	Works []struct {
		Key string `json:"key"`
	} `json:"works"`
}

// Isbns returns all found ISBN: unique, sorted, non-normalized.
func (v OpenLibraryEdition) Isbns() []string { s := set.New() for _, w := range v.Isbn10 { s.Add(w) } for _, w := range v.Isbn13 { s.Add(w) } return s.Sorted() } // OpenLibraryEditionToRelease convert OL edition record into a release. Takes // a (potentially empty) author map to replace author keys (like // "/author/OL18273A") with names. func OpenLibraryEditionToRelease(v *OpenLibraryEdition, authorMap map[string]string) (*Release, error) { var ( release Release contribs = make([]struct { Index int `json:"index,omitempty"` RawName string `json:"raw_name,omitempty"` Role string `json:"role,omitempty"` }, len(v.Authors)) ) for i, author := range v.Authors { // /authors/OL1076839A Ralph S. Hattox // /authors/OL1054947A Conferenza economica dell'area pisana (1984 Pisa, Italy) // /authors/OL1069856A Terry S. Boutet // /authors/OL1077217A Pamela Beck // ... name, ok := authorMap[author.Key] if !ok { continue } contribs[i].RawName = name } release.Ident = v.Key release.Contribs = contribs if v.Subtitle != "" { release.Title = fmt.Sprintf("%s: %s", v.Title, v.Subtitle) release.Extra.SubtitleValue = v.Subtitle } else { release.Title = v.Title } release.ExtIDs.ISBN = v.Isbns() if len(v.Publishers) > 0 { release.Publisher = v.Publishers[0] } if len(v.Works) > 0 { // TODO: We should be ok with just the basename, e.g. just "OL10896658M" release.Extra.OpenLibrary.WorkID = v.Works[0].Key release.WorkID = v.Works[0].Key } // e.g. // "source_records": [ // "amazon:0531203093", // "ia:multiplebirths0000land", // "marc:marc_loc_2016/BooksAll.2016.part25.utf8:104915596:921" // ] release.Extra.OpenLibrary.SourceRecords = v.SourceRecords for _, l := range openLibraryDateLayouts { t, err := time.Parse(l, v.PublishDate) if err != nil { continue } release.ReleaseYearValue = t.Year() break } return &release, nil } // ShortenOpenLibraryIdentifier shortens an id like "/books/OL10899962M" to // "OL10899962M" or returns the same string, if no simplifications can be made. // TODO: add test. 
func ShortenOpenLibraryIdentifier(s string) string {
	// Walk the slash separated segments from left to right; the first one
	// starting with "OL" wins.
	start := 0
	for i := 0; i <= len(s); i++ {
		if i < len(s) && s[i] != '/' {
			continue
		}
		if segment := s[start:i]; strings.HasPrefix(segment, "OL") {
			return segment
		}
		start = i + 1
	}
	return s
}