| author | bnewbold <bnewbold@archive.org> | 2020-04-01 22:03:19 +0000 |
|---|---|---|
| committer | bnewbold <bnewbold@archive.org> | 2020-04-01 22:03:19 +0000 |
| commit | 32f195cec41459045f3d3453dad7a97b38d4e288 (patch) | |
| tree | ab166daf8686472ed9641d96ab055f37ee89d71c /python/fatcat_tools | |
| parent | 0e2025091d0c974a888a5bc741495951c952ccda (diff) | |
| parent | 938d2c5366d80618b839c83baadc9b5c62d10dce (diff) | |
| download | fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.tar.gz fatcat-32f195cec41459045f3d3453dad7a97b38d4e288.zip | |
Merge branch 'bnewbold-pubmed-get_text' into 'master'
beautifulsoup XML parsing: .string vs. .get_text()
See merge request webgroup/fatcat!40
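For context, a standalone sketch (not part of this repository) of the BeautifulSoup behavior this merge works around: `.string` returns `None` whenever an element has mixed content (for example a nested tag), while `.get_text()` concatenates all descendant text, possibly including newlines, which the importers then squash with `.replace('\n', ' ')`. The XML snippet below is invented for illustration.

```python
# Illustration of .string vs .get_text(); the "xml" feature requires lxml.
from bs4 import BeautifulSoup

snippet = "<ArticleTitle>Effects of <i>E. coli</i> on\ngut flora</ArticleTitle>"
tag = BeautifulSoup(snippet, "xml").ArticleTitle

print(tag.string)      # None -- the nested <i> element breaks .string
print(tag.get_text())  # 'Effects of E. coli on\ngut flora'

# The importers additionally flatten embedded newlines:
print(tag.get_text().replace('\n', ' ').strip())
```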
Diffstat (limited to 'python/fatcat_tools')
| mode | path | lines changed |
|---|---|---|
| -rw-r--r-- | python/fatcat_tools/importers/arxiv.py | 22 |
| -rw-r--r-- | python/fatcat_tools/importers/jalc.py | 14 |
| -rw-r--r-- | python/fatcat_tools/importers/jstor.py | 18 |
| -rw-r--r-- | python/fatcat_tools/importers/pubmed.py | 32 |

4 files changed, 47 insertions, 39 deletions
diff --git a/python/fatcat_tools/importers/arxiv.py b/python/fatcat_tools/importers/arxiv.py
index c69ee16a..719592fc 100644
--- a/python/fatcat_tools/importers/arxiv.py
+++ b/python/fatcat_tools/importers/arxiv.py
@@ -118,13 +118,13 @@ class ArxivRawImporter(EntityImporter):
             if not (doi.startswith('10.') and '/' in doi and doi.split('/')[1]):
                 sys.stderr.write("BOGUS DOI: {}\n".format(doi))
                 doi = None
-        title = latex_to_text(metadata.title.string)
-        authors = parse_arxiv_authors(metadata.authors.string)
+        title = latex_to_text(metadata.title.get_text().replace('\n', ' '))
+        authors = parse_arxiv_authors(metadata.authors.get_text().replace('\n', ' '))
         contribs = [fatcat_openapi_client.ReleaseContrib(index=i, raw_name=a, role='author') for i, a in enumerate(authors)]
         lang = "en"     # the vast majority in english
-        if metadata.comments and metadata.comments.string:
-            comments = metadata.comments.string.strip()
+        if metadata.comments and metadata.comments.get_text():
+            comments = metadata.comments.get_text().replace('\n', ' ').strip()
             extra_arxiv['comments'] = comments
             if 'in french' in comments.lower():
                 lang = 'fr'
@@ -145,8 +145,8 @@ class ArxivRawImporter(EntityImporter):
             # more languages?

         number = None
-        if metadata.find('journal-ref') and metadata.find('journal-ref').string:
-            journal_ref = metadata.find('journal-ref').string.strip()
+        if metadata.find('journal-ref') and metadata.find('journal-ref').get_text():
+            journal_ref = metadata.find('journal-ref').get_text().replace('\n', ' ').strip()
             extra_arxiv['journal_ref'] = journal_ref
             if "conf." in journal_ref.lower() or "proc." in journal_ref.lower():
                 release_type = "paper-conference"
@@ -160,16 +160,16 @@ class ArxivRawImporter(EntityImporter):
                 release_type = "report"
         if metadata.find('acm-class') and metadata.find('acm-class').string:
             extra_arxiv['acm_class'] = metadata.find('acm-class').string.strip()
-        if metadata.categories and metadata.categories.string:
-            extra_arxiv['categories'] = metadata.categories.string.split()
+        if metadata.categories and metadata.categories.get_text():
+            extra_arxiv['categories'] = metadata.categories.get_text().split()
         license_slug = None
-        if metadata.license and metadata.license.string:
-            license_slug = lookup_license_slug(metadata.license.string)
+        if metadata.license and metadata.license.get_text():
+            license_slug = lookup_license_slug(metadata.license.get_text())
         abstracts = None
         if metadata.abstract:
             # TODO: test for this multi-abstract code path
             abstracts = []
-            abst = metadata.abstract.string.strip()
+            abst = metadata.abstract.get_text().strip()
             orig = None
             if '-----' in abst:
                 both = abst.split('-----')
diff --git a/python/fatcat_tools/importers/jalc.py b/python/fatcat_tools/importers/jalc.py
index 351a20a3..c2adc0d6 100644
--- a/python/fatcat_tools/importers/jalc.py
+++ b/python/fatcat_tools/importers/jalc.py
@@ -35,13 +35,13 @@ def parse_jalc_persons(raw_persons):
     for raw in raw_persons:
         name = raw.find('name') or None
         if name:
-            name = clean(name.string)
+            name = clean(name.get_text().replace('\n', ' '))
         surname = raw.find('familyName') or None
         if surname:
-            surname = clean(surname.string)
+            surname = clean(surname.get_text().replace('\n', ' '))
         given_name = raw.find('givenName') or None
         if given_name:
-            given_name = clean(given_name.string)
+            given_name = clean(given_name.get_text().replace('\n', ' '))
         lang = 'en'
         if is_cjk(name):
             lang = 'ja'
@@ -163,12 +163,12 @@ class JalcImporter(EntityImporter):
         titles = record.find_all("title")
         if not titles:
             return None
-        title = titles[0].string.strip()
+        title = titles[0].get_text().replace('\n', ' ').strip()
         original_title = None
         if title.endswith('.'):
             title = title[:-1]
         if len(titles) > 1:
-            original_title = titles[1].string.strip()
+            original_title = titles[1].get_text().replace('\n', ' ').strip()
             if original_title.endswith('.'):
                 original_title = original_title[:-1]
@@ -242,7 +242,7 @@ class JalcImporter(EntityImporter):
         container_extra = dict()

         if record.publicationName:
-            pubs = [p.string.strip() for p in record.find_all("publicationName") if p.string]
+            pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publicationName") if p.get_text()]
             pubs = [clean(p) for p in pubs if p]
             assert(pubs)
             if len(pubs) > 1 and pubs[0] == pubs[1]:
@@ -255,7 +255,7 @@ class JalcImporter(EntityImporter):
                 container_extra['original_name'] = clean(pubs[1])

         if record.publisher:
-            pubs = [p.string.strip() for p in record.find_all("publisher") if p.string]
+            pubs = [p.get_text().replace('\n', ' ').strip() for p in record.find_all("publisher") if p.get_text()]
             pubs = [p for p in pubs if p]
             if len(pubs) > 1 and pubs[0] == pubs[1]:
                 pubs = [pubs[0]]
diff --git a/python/fatcat_tools/importers/jstor.py b/python/fatcat_tools/importers/jstor.py
index 5ff1ecd9..96dbf947 100644
--- a/python/fatcat_tools/importers/jstor.py
+++ b/python/fatcat_tools/importers/jstor.py
@@ -63,13 +63,13 @@ class JstorImporter(EntityImporter):
         release_type = JSTOR_TYPE_MAP.get(article['article-type'])

         title = article_meta.find("article-title")
-        if title and title.string:
-            title = title.string.strip()
-        elif title and not title.string:
+        if title and title.get_text():
+            title = title.get_text().replace('\n', ' ').strip()
+        elif title and not title.get_text():
             title = None

         if not title and release_type.startswith('review') and article_meta.product.source:
-            title = "Review: {}".format(article_meta.product.source.string)
+            title = "Review: {}".format(article_meta.product.source.replace('\n', ' ').get_text())

         if not title:
             return None
@@ -96,8 +96,8 @@ class JstorImporter(EntityImporter):
         if journal_ids:
             extra_jstor['journal_ids'] = journal_ids

-        journal_title = journal_meta.find("journal-title").string
-        publisher = journal_meta.find("publisher-name").string
+        journal_title = journal_meta.find("journal-title").get_text().replace('\n', ' ')
+        publisher = journal_meta.find("publisher-name").get_text().replace('\n', ' ')
         issn = journal_meta.find("issn")
         if issn:
             issn = issn.string
@@ -141,13 +141,13 @@ class JstorImporter(EntityImporter):
             for c in cgroup.find_all("contrib"):
                 given = c.find("given-names")
                 if given:
-                    given = clean(given.string)
+                    given = clean(given.get_text().replace('\n', ' '))
                 surname = c.find("surname")
                 if surname:
-                    surname = clean(surname.string)
+                    surname = clean(surname.get_text().replace('\n', ' '))
                 raw_name = c.find("string-name")
                 if raw_name:
-                    raw_name = clean(raw_name.string)
+                    raw_name = clean(raw_name.get_text().replace('\n', ' '))

                 if not raw_name:
                     if given and surname:
diff --git a/python/fatcat_tools/importers/pubmed.py b/python/fatcat_tools/importers/pubmed.py
index 3ecf5ef4..abcb21d9 100644
--- a/python/fatcat_tools/importers/pubmed.py
+++ b/python/fatcat_tools/importers/pubmed.py
@@ -392,8 +392,9 @@ class PubmedImporter(EntityImporter):
         if pages:
             pages = pages.string

-        title = medline.Article.ArticleTitle.string # always present
+        title = medline.Article.ArticleTitle.get_text() # always present
         if title:
+            title = title.replace('\n', ' ')
             if title.endswith('.'):
                 title = title[:-1]
             # this hides some "special" titles, but the vast majority are
@@ -406,20 +407,27 @@ class PubmedImporter(EntityImporter):

         original_title = medline.Article.find("VernacularTitle", recurse=False)
         if original_title:
-            original_title = original_title.string or None
+            original_title = original_title.get_text() or None
+            original_title = original_title.replace('\n', ' ')
             if original_title and original_title.endswith('.'):
                 original_title = original_title[:-1]

+        if original_title and not title:
+            # if we only have an "original" title, but not translated/english
+            # title, sub in the original title so the entity can be created
+            title = original_title
+            original_title = None
+
         # TODO: happening in alpha order, not handling multi-language well.
         language = medline.Article.Language
         if language:
-            language = language.string
+            language = language.get_text()
             if language in ("und", "un"):
                 # "undetermined"
                 language = None
             else:
                 language = LANG_MAP_MARC.get(language)
-                if not language and not (medline.Article.Language.string in LANG_MAP_MARC):
+                if not language and not (medline.Article.Language.get_text() in LANG_MAP_MARC):
                     warnings.warn("MISSING MARC LANG: {}".format(medline.Article.Language.string))

         ### Journal/Issue Metadata
@@ -479,7 +487,7 @@ class PubmedImporter(EntityImporter):
                 print("unparsable medline date, skipping: {}".format(medline_date), file=sys.stderr)

         if journal.find("Title"):
-            container_name = journal.Title.string
+            container_name = journal.Title.get_text()

         if (container_id is None and self.create_containers and (issnl is not None)
                 and container_name):
@@ -558,15 +566,15 @@ class PubmedImporter(EntityImporter):
                 surname = None
                 raw_name = None
                 if author.ForeName:
-                    given_name = author.ForeName.string
+                    given_name = author.ForeName.get_text().replace('\n', ' ')
                 if author.LastName:
-                    surname = author.LastName.string
+                    surname = author.LastName.get_text().replace('\n', ' ')
                 if given_name and surname:
                     raw_name = "{} {}".format(given_name, surname)
                 elif surname:
                     raw_name = surname
-                if not raw_name and author.CollectiveName and author.CollectiveName.string:
-                    raw_name = author.CollectiveName.string
+                if not raw_name and author.CollectiveName and author.CollectiveName.get_text():
+                    raw_name = author.CollectiveName.get_text().replace('\n', ' ')
                 contrib_extra = dict()
                 orcid = author.find("Identifier", Source="ORCID")
                 if orcid:
@@ -588,9 +596,9 @@ class PubmedImporter(EntityImporter):
                 affiliations = author.find_all("Affiliation")
                 raw_affiliation = None
                 if affiliations:
-                    raw_affiliation = affiliations[0].string
+                    raw_affiliation = affiliations[0].get_text().replace('\n', ' ')
                     if len(affiliations) > 1:
-                        contrib_extra['more_affiliations'] = [ra.string for ra in affiliations[1:]]
+                        contrib_extra['more_affiliations'] = [ra.get_text().replace('\n', ' ') for ra in affiliations[1:]]
                 if author.find("EqualContrib"):
                     # TODO: schema for this?
                     contrib_extra['equal'] = True
@@ -638,7 +646,7 @@ class PubmedImporter(EntityImporter):
                         ref_release_id = self.lookup_pmid(ref_pmid)
                 ref_raw = ref.Citation
                 if ref_raw:
-                    ref_extra['unstructured'] = ref_raw.string
+                    ref_extra['unstructured'] = ref_raw.get_text()
                 if not ref_extra:
                     ref_extra = None
                 refs.append(fatcat_openapi_client.ReleaseRef(
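The `.get_text().replace('\n', ' ').strip()` chain recurs throughout the diff. A hypothetical helper (not present in the codebase) capturing that idiom might look like this sketch:

```python
def element_text(tag):
    """Hypothetical helper: normalized text of an optional bs4 tag, or None.

    Mirrors the pattern repeated above: take all descendant text,
    flatten newlines to spaces, strip the ends, and treat empty as missing.
    """
    if not tag:
        return None
    text = tag.get_text().replace('\n', ' ').strip()
    return text or None
```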
