Source code for pyhpo.stats

from typing import Callable, Dict, List, Union, Tuple

try:
    from scipy.stats import hypergeom  # type: ignore[import]
except ImportError:
    print(
        'The pyhpo.stats module requires that you install scipy.',
        '\n\n#######################################################'
        '\n\n#   ==> Please install scipy via `pip install scipy`  #'
        '\n\n#######################################################\n\n'
    )
    raise ImportError()

import pyhpo
from pyhpo import Ontology
from pyhpo import HPOSet


def hypergeom_test(
    positive_samples: int,
    samples: int,
    positive_total: int,
    total: int
) -> float:
    """
    Wrapper function to call the scipy hypergeometric stats function

    Parameters
    ----------
        positive_samples: int
            Number of successes in the sample set (correctly drawn marbles)
        samples: int
            Total number of samples (number of drawn marbles)
        positive_total: int
            Number of positives in the reference set
            (number of positive marbles in the bag)
        total: int
            Total size of reference set
            (number of marbles in the bag)

    Returns
    -------
    float
        The hypergeometic enrichment score

    """
    return float(hypergeom.sf(
        positive_samples-1,  # likelyhood of more than X, #see https://blog.alexlenail.me/understanding-and-implementing-the-hypergeometric-test-in-python-a7db688a7458  # noqa: 501
        total,
        positive_total,
        samples
    ))


[docs]class HPOEnrichment():
    """
    Calculates the enrichment of HPO Terms in an Annotation set.

    You can use this class for the following example use cases:

    * You have a list of genes and want to see if some HPO terms are
      enriched in that group. (e.g. RNAseq differential gene expression)
    * You have a list of OMIM diseases and want to see if they have
      some underlying HPO symptom.

    Parameters
    ----------
    category: str
        String to declare if enrichment is done for genes or for OMIM diseases

        Options are:

        * **gene**
        * **omim**

    """
    def __init__(self, category: str) -> None:
        category_lookup = {
            'gene': Ontology.genes,
            'omim': Ontology.omim_diseases
        }
        self.hpos, self.total = self._hpo_count(
            category_lookup[category]  # type: ignore[arg-type]
        )

[docs]    def enrichment(
        self,
        method: str,
        annotation_sets: List['pyhpo.Annotation']
    ) -> List[dict]:
        """
        Calculates the enrichment of HPO terms in the provided annotation set

        Parameters
        ----------
        method: str
            The statistical test for enrichment

            * **hypergeom** Hypergeometric distribution test

        annotation_sets: list of ``annoation``
            Every ``annotation`` item in the list must have an attribute
            ``hpos``, being a list of HPO-Term indicies

        Returns
        -------
        list of dict
            The enrichment of every HPO term in the ``annotation_sets`` list,
            sorted by descending enrichment. Every dict has the following keys:

            * **hpo**: :class:`.HPOTerm`
            * **count**: Number of appearances in the sets
            * **enrichment**: Enrichment score

        """
        list_counts, list_total = self._hpo_count(annotation_sets)
        res = [{
            'hpo': Ontology[hpo],
            'count': count,
            'enrichment': self._single_enrichment(
                method,
                hpo,
                count,
                list_total
            )} for hpo, count in list_counts.items()
        ]

        return sorted(res, key=lambda x: x['enrichment'])

    def _hpo_count(
        self,
        annotation_sets: List['pyhpo.Annotation']
    ) -> Tuple[dict, int]:
        """
        Counts the number of occurrenes of every HPO term
        in the ``annotation_sets``

        Parameters
        ----------
            annotation_set: list of ``annoation``
                Every ``annotation`` item in the list must have an attribute
                ``hpos``, returning an iterable of :class:`.HPOTerm`

        Returns
        -------
        tuple with following items:
            * Dict with
                * key: :class:`.HPOTerm`
                * value: int <Number of occurences>
            * Total number of HPO terms in set
        """
        hpos = {}
        for item in annotation_sets:
            for term in item.hpo:
                if term not in hpos:
                    hpos[term] = 0
                hpos[term] += 1
        return (hpos, sum(hpos.values()))

    def _single_enrichment(
        self,
        method: str,
        hpo_id: Union[int, 'pyhpo.HPOTerm'],
        positives: int,
        samples: int
    ) -> float:
        """
        Calculates the enrichment of a single HPO term compared to
        the reference set

        Parameters
        ----------
            method: str
                The statistical test for enrichment

                * **hypergeom** Hypergeometric distribution test

            hpo_id: int or :class:`.HPOTerm`
                ID of the HPO Term
            positives: int
                Number of successes in the sample set (correctly drawn marbles)
            samples: int
                Total number of samples (number of drawn marbles)

        Returns
        -------
        float
            The enrichment score
        """
        try:
            positive_total = self.hpos[hpo_id]
        except KeyError:
            raise RuntimeError(
                'The HPO term {} is not present in the '
                'reference population'.format(hpo_id)
            )
        if method == 'hypergeom':
            return hypergeom_test(
                positives,
                samples,
                positive_total,
                self.total
            )
        else:
            raise NotImplementedError('Enrichment method not implemented')


[docs]class EnrichmentModel():
    """
    Calculates the enrichment of annotations in an :class:`.HPOSet`.

    You can use this class for the following example use cases:

    * You have a set of HPOTerms and want to find the
      most likely causative gene
    * You have a set of HPOTerms and want to find the underlying disease

    Parameters
    ----------
    category: str
        String to declare if enrichment is done for genes or for OMIM diseases

        Options are:

        * **gene**
        * **omim**
        * **orpha**
        * **decipher**

    """
    attribute_lookup: Dict[str, Callable] = {
        'gene': lambda x: x.genes,
        'omim': lambda x: x.omim_diseases,
        'orpha': lambda x: x.orpha_diseases,
        'decipher': lambda x: x.decipher_diseases
    }

    def __init__(self, category: str) -> None:
        self.attribute = self.attribute_lookup[category]
        self.base_count, self.total = self._population_count(HPOSet(Ontology))

[docs]    def enrichment(
        self,
        method: str,
        hposet: HPOSet
    ) -> List[dict]:
        """
        Calculates the enrichment of annotations in the provided HPOSet

        Parameters
        ----------
        method: str
            The statistical test for enrichment

            * **hypergeom** Hypergeometric distribution test

        hposet: :class:`.HPOSet`

        Returns
        -------
        list of dict
            The enrichment of every annotation item sorted by
            descending enrichment. Every dict has the following keys:

            * **item**: Gene or OMIM or Decipher annotation item
            * **count**: Number of appearances in the sets
            * **enrichment**: Enrichment score

        """
        list_counts, list_total = self._population_count(hposet)
        res = [{
            'item': item,
            'count': count,
            'enrichment': self._single_enrichment(
                method,
                item,
                count,
                list_total
            )} for item, count in list_counts.items()
        ]
        return sorted(res, key=lambda x: x['enrichment'])

    def _population_count(self, hopset: HPOSet) -> Tuple[dict, int]:
        """
        Counts the number of occurrenes of every annotation item
        in the HPOSet

        Parameters
        ----------
            hposet: :class:`.HPOSet`

        Returns
        -------
        tuple with following items:
            * Dict with
                * key: Annotation Item
                * value: int <Number of occurences>
            * Total number of annotations in set
        """
        population = {}
        for term in hopset:
            for item in self.attribute(term):
                if item not in population:
                    population[item] = 0
                population[item] += 1
        return population, sum(population.values())

    def _single_enrichment(
        self,
        method: str,
        item_id: int,
        positives: int,
        samples: int
    ) -> float:
        """
        Calculates the enrichment of annotations in an HPO set

        Parameters
        ----------
            method: str
                The statistical test for enrichment

                * **hypergeom** Hypergeometric distribution test

            item_id: int
                ID of the Annotation
            positives: int
                Number of successes in the sample set (correctly drawn marbles)
            samples: int
                Total number of samples (number of drawn marbles)

        Returns
        -------
        float
            The enrichment score
        """
        try:
            positive_total = self.base_count[item_id]
        except KeyError:
            raise RuntimeError(
                'The item {} is not present in the '
                'reference population'.format(item_id)
            )
        if method == 'hypergeom':
            return hypergeom_test(
                positives,
                samples,
                positive_total,
                self.total
            )
        else:
            raise NotImplementedError('Enrichment method not implemented')