Source code for collabutils.zotero

"""
Zotero provides a good platform to collaborate on bibliographies.

We support collaborative curation of bibliographical data as follows:

- The data is curated in a shared Zotero library.
- Items in this library are tagged for membership in the bibliographies of certain datasets, \
  using tags of the form "dataset:<dataset-id>".
- Items are optionally tagged with tags of the form "id:<dataset-id>:<local-id>" for local \
  BibTeX keys in a specific dataset.

An instance of :class:`Zotero` allows

- seeding such libraries from a (possibly stub) BibTeX file via :meth:`Zotero.upload_bib`
- downloading such libraries via :meth:`Zotero.download_bib`
- deleting such libraries via :meth:`Zotero.delete_bib`
"""
import re
import copy
import pathlib
import collections

from pycldf.sources import Source

from collabutils.util import warn
try:
    from pybtex.database import parse_string
    from pyzotero.zotero import Zotero as API
except ImportError as e:  # pragma: no cover
    warn('zotero.Zotero', 'zotero', e)

__all__ = ['Zotero']

# Mapping of (lowercased) BibTeX/BibLaTeX entry types to Zotero item types.
# Entry types without a mapping are listed as comments below; lookups for such types are expected
# to supply their own fallback (e.g. via `dict.get`).
BIBTEX_TO_ZOTERO = {
    'article': 'journalArticle',
    'book': 'book',
    'booklet': 'book',
    'conference': 'conferencePaper',
    'inbook': 'bookSection',
    'incollection': 'bookSection',
    # "inproceedings" is the standard BibTeX entry type; the misspelled singular form is kept for
    # backward compatibility with data that relied on it.
    'inproceedings': 'conferencePaper',
    'inproceeding': 'conferencePaper',
    # manual
    'mastersthesis': 'thesis',
    'misc': 'document',
    'phdthesis': 'thesis',  # + thesisType! 'PhD thesis' or 'masters thesis'
    # proceedings
    'techreport': 'report',
    'unpublished': 'manuscript',
    # BibLaTeX
    # mvbook
    # bookinbook
    # suppbook
    # booklet
    # collection
    # mvcollection
    # incollection
    # suppcollection
    # online
    # patent
    # periodical
    # suppperiodical
    # mvproceedings
    # reference
    # mvreference
    # inreference
    # report
    # set
    'thesis': 'thesis',
    # custom
    # electronic
}

# Mapping of (lowercased) BibTeX field names to the corresponding Zotero item field names.
# BibTeX fields not listed here are looked up in Zotero items under their own name.
# Note: "author" and "editor" are not mapped here, they are converted to Zotero "creators".
FIELDS_TO_ZOTERO = dict(
    address='place',
    booktitle='bookTitle',
    doi='DOI',
    journal='publicationTitle',
    school='institution',
    year='date',
    abstract='abstractNote',
)


class Zotero:
    """
    A high-level Zotero API client. Low level communication with Zotero is done using the
    `pyzotero` package.

    Items are linked to datasets via tags: "dataset:<dataset-id>" marks membership in the
    bibliography of a dataset, "id:<dataset-id>:<local-id>" records the dataset-local BibTeX key.

    .. seealso:: `<https://pypi.org/project/Pyzotero/>`_
    """
    def __init__(self, libid, apikey, group=True):
        """
        :param libid: Numeric ID of the Zotero library
        :param apikey: API key with read/write permissions for the library
        :param group: Flag signaling whether we are dealing with a group or user library
        """
        self.api = API(libid, 'group' if group else 'user', apikey)
        # Item templates fetched from the API, keyed by Zotero item type (see `_empty_item`).
        self._item_templates = {}

    @staticmethod
    def id_tag(dataset_id, bibkey):
        """
        :return: The tag recording `bibkey` as local BibTeX key of an item in dataset `dataset_id`.
        """
        return 'id:{}:{}'.format(dataset_id, bibkey)

    @staticmethod
    def id_from_tagstring(s, dataset_id):
        """
        Extract the dataset-local BibTeX key from a string containing tags.

        :param s: String to search (e.g. the "keywords" field of an exported BibTeX record).
        :param dataset_id: Dataset identifier.
        :return: The local ID of the first "id:<dataset-id>:<local-id>" tag found in `s` (up to \
        the next whitespace or comma) or `None`.
        """
        m = re.search(r'id:{}:(?P<id>[^\s,]+)'.format(re.escape(dataset_id)), s)
        return m.group('id') if m else None

    @staticmethod
    def dataset_tag(dataset_id):
        """
        :return: The tag marking an item as member of the bibliography of dataset `dataset_id`.
        """
        return 'dataset:{}'.format(dataset_id)

    def upload_bib(self, dataset_id, bibpath, log=None):
        """
        Upload items from a BibTeX file as items of the Zotero library for a dataset.

        Entries for which an item with a matching "id:<dataset-id>:<local-id>" tag already exists
        in the library are skipped.

        :param dataset_id: Dataset identifier (e.g. a `cldfbench.Dataset.id`).
        :param bibpath: Path to the BibTeX file to be uploaded (read as UTF-8).
        :param log: Optional logger; skipped entries are reported via `log.info`.
        :return: `OrderedDict` mapping BibTeX keys to `True` if the item already existed or was \
        created, and to `False` if creation was not reported as successful.
        """
        res = collections.OrderedDict()
        db = parse_string(pathlib.Path(bibpath).read_text(encoding='utf8'), 'bibtex')
        dataset_tag = self.dataset_tag(dataset_id)
        for key, entry in db.entries.items():
            src = Source.from_entry(key, entry)
            tag = self.id_tag(dataset_id, src.id)
            # One hit is enough to know that the entry has been uploaded before.
            if self.api.items(tag=tag, limit=1):
                if log:  # pragma: no cover
                    log.info('Skipping {}'.format(tag))
                res[src.id] = True
            else:
                res[src.id] = self.create_item(src, tag, dataset_tag)
        return res

    def download_bib(self, dataset_id, bibpath=None, remove=None):
        """
        Download all items tagged for a dataset into a BibTeX file.

        :param dataset_id: Dataset identifier.
        :param bibpath: Optional path of a file to which the BibTeX is written (as UTF-8).
        :param remove: List of field names to remove (e.g. the "keywords" field, which is \
        populated by Zotero's BibTeX export with all tags) or `None`.
        :return: The BibTeX records as `str`, separated by blank lines.
        """
        remove = remove or []
        records = []
        for item in self.get_items(self.dataset_tag(dataset_id), content='bibtex'):
            src = self._bibtex2source(item)
            # determine the dataset-local bibkey (must happen before fields are removed, because
            # the tags are read from the "keywords" field):
            id_ = self.id_from_tagstring(src.get('keywords', ''), dataset_id)
            if id_:
                src.id = id_
            for k in remove:
                src.pop(k, None)
            records.append(src.bibtex())
        bib = '\n\n'.join(records)
        if bibpath:
            pathlib.Path(bibpath).write_text(bib, encoding='utf8')
        return bib

    def delete_bib(self, dataset_id, log=None):
        """
        Delete all items tagged for a dataset - unless they are tagged for other datasets as well.

        An item is only deleted if each of its tags is either the dataset tag or an ID tag of
        this dataset; any other tag is conservatively taken as a sign that the item is still
        needed elsewhere.

        :param dataset_id: Dataset identifier.
        :param log: Optional logger; skipped items are reported via `log.warning`.
        :return: `None`
        """
        own_tag = self.dataset_tag(dataset_id)
        own_id_prefix = self.id_tag(dataset_id, '')
        for item in self.get_items(own_tag):
            # Tags must match exactly (or by full "id:<dataset-id>:" prefix): a substring test
            # would wrongly treat tags of a dataset "abcdef" as belonging to dataset "abc".
            exclusive = all(
                t['tag'] == own_tag or t['tag'].startswith(own_id_prefix)
                for t in item['data']['tags'])
            if not exclusive:
                if log:
                    log.warning('Skipping item linked to other datasets')
                continue
            self.api.delete_item(item)

    def get_items(self, tag, **kw):
        """
        Retrieve all items with a given tag.

        :param tag: Tag to filter items by.
        :param kw: Additional keyword arguments passed into the `items` call of the API.
        :return: The result of the API's `everything` method for the query.
        """
        return self.api.everything(self.api.items(tag=tag, **kw))

    @staticmethod
    def _bibtex2source(bibtex):
        """
        Convert a BibTeX record into a `Source`, restoring data stashed in the "note" field.

        Markers of the form ":bibtex:type: <genre>:xetbib:" and
        ":bibtex:field:<name>: <value>:xetbib:" (as written by `create_item`) are removed from the
        note and applied to the genre and the fields of the source, respectively.

        :param bibtex: `str` containing (at least) one BibTeX record; only the first is used.
        :return: `Source` instance.
        """
        src = Source.from_entry(*list(parse_string(bibtex, 'bibtex').entries.items())[0])

        def repl(m):
            if m.group('cat') == 'type':
                src.genre = m.group('value').strip()
                return ''
            field, sep, value = m.group('value').partition(':')
            if not sep:
                # Malformed marker without "<name>: <value>" content: leave the text untouched
                # rather than failing on the whole record.
                return m.group(0)
            if field == 'note':
                # The original note is put back in place of the marker.
                return value.strip()
            src[field] = value.strip()
            return ''

        if 'note' in src:
            note = re.sub(
                ':bibtex:(?P<cat>type|field):(?P<value>.+?):xetbib:', repl, src['note']).strip()
            if note:
                src['note'] = note
            else:
                del src['note']
        return src

    def _empty_item(self, src):
        """
        :param src: `Source` instance.
        :return: A fresh copy of the Zotero item template matching the genre of `src`; genres \
        without a mapping in `BIBTEX_TO_ZOTERO` get a "document" template.
        """
        zot_genre = BIBTEX_TO_ZOTERO.get(src.genre.lower(), 'document')
        if zot_genre not in self._item_templates:
            self._item_templates[zot_genre] = self.api.item_template(zot_genre)
        # Copy, so that the cached template is not modified when the item is filled.
        return copy.deepcopy(self._item_templates[zot_genre])

    def create_item(self, src, *tags):
        """
        Create an item in the Zotero library from a `Source`.

        Fields which have a string-valued counterpart in the Zotero item template are stored
        there; all other fields and the BibTeX genre are serialized into the "extra" field of the
        item, wrapped in ":bibtex:...:xetbib:" markers.

        :param src: `Source` instance.
        :param tags: Tags (`str`) to attach to the item.
        :return: `bool` signaling whether the API reported a successfully created item.
        """
        item = self._empty_item(src)

        extra = {'type': src.genre}
        for k in src:
            kl = k.lower()
            if kl in ('editor', 'author'):
                # Persons are converted to creators below.
                continue
            zotk = FIELDS_TO_ZOTERO.get(kl, kl)
            if isinstance(item.get(zotk), str):
                item[zotk] = src[k]
            else:
                extra['field:{}'.format(kl)] = src[k]

        item['extra'] = '\n'.join(
            ':bibtex:{}: {}:xetbib:'.format(k, v) for k, v in extra.items())

        entry = src.entry
        item['creators'] = []
        for ct in ['author', 'editor']:
            for name in entry.persons[ct]:
                # The string form of a name is split at commas: the first part is the last name,
                # the last part the first name. A name without comma has no first name - it must
                # not be repeated as first name.
                last, sep, rest = str(name).partition(',')
                item['creators'].append(dict(
                    creatorType=ct,
                    firstName=rest.split(',')[-1].strip() if sep else '',
                    lastName=last))
        item['tags'] = [dict(tag=t) for t in [*item['tags'], *tags]]
        return bool(self.api.create_items([item])['success'])