# Source code for pyhpo.parser.obo

"""
Parse the OBO flat-file
"""
import os
from typing import Callable, Dict, Iterator, List

from pyhpo.config import TRUTH


FILENAME = 'hp.obo'


class Metadata:
    """
    Container for the header part of the OBO file

    Everything is stored on the class itself (no instances are
    needed), so all header rows end up in one shared ``header`` list.
    """
    # Only declared here; no value is assigned in the code of this module
    format_version: str
    data_version: str
    # Header rows in the order in which they were added
    header: List[str] = []

    @classmethod
    def add_header_row(cls, row: str) -> None:
        """
        Appends a single row to ``Metadata.header``

        Parameters
        ----------
        row:
            One line of the OBO file header
        """
        cls.header.append(row)


class Converter:
    """
    Registry of the conversions that are applied to the raw
    data of one term section of the OBO file

    ``key_conversion`` maps attribute names of the OBO file to the
    names used for the term, ``type_conversions`` maps an attribute
    name to the function that converts its raw list of values.

    All converter functions share the same signature:
    ``(value, key, values)`` - the raw values of the attribute,
    the attribute name and the data of the complete term.
    """
    key_conversion: Dict[str, str] = {
        'def': 'definition'
    }

    type_conversions: Dict[str, Callable] = {}

    @classmethod
    def add_type_conversion(cls, key: str, func: Callable) -> None:
        """
        Registers ``func`` as converter for the attribute ``key``.
        An already registered converter for ``key`` is replaced.
        """
        cls.type_conversions[key] = func

    @staticmethod
    def array_to_str(
        value: List[str],
        key: str,
        values: List[List[str]]
    ) -> str:
        """
        Returns the first item of ``value``
        or an empty string if ``value`` is empty
        """
        return value[0] if value else ''

    @staticmethod
    def array_to_bool(
        value: List[str],
        key: str,
        values: List[List[str]]
    ) -> bool:
        """
        Returns ``True`` if the lowercased first item of ``value``
        is contained in ``TRUTH``, ``False`` otherwise
        (also if ``value`` is empty)
        """
        if value:
            return value[0].lower() in TRUTH
        return False

    @staticmethod
    def parse_synonym(
        value: List[str],
        key: str,
        values: List[List[str]]
    ) -> List[str]:
        """
        Extracts the synonym titles from the synonym
        data lines in the obo file format

        Parameters
        ----------
        value: list of str
            value parts of the synonym-data lines of the obo file

            e.g: ['"Multicystic dysplastic kidney" EXACT []']

        Returns
        -------
        list of str
            Actual synonym titles, i.e. the text between
            the first pair of double quotes of every line

            e.g.: ['Multicystic dysplastic kidney']
        """
        titles = []
        for synonym_line in value:
            # [0] is the text before the first quote, [1] the quoted title
            quoted_parts = synonym_line.split('"')
            titles.append(quoted_parts[1])
        return titles


# Register the converter function for every term attribute
# whose raw list of values must be converted to another type
for _key, _converter in (
    ('id', Converter.array_to_str),
    ('name', Converter.array_to_str),
    ('comment', Converter.array_to_str),
    ('definition', Converter.array_to_str),
    ('is_obsolete', Converter.array_to_bool),
    ('replaced_by', Converter.array_to_str),
    ('synonym', Converter.parse_synonym),
):
    Converter.add_type_conversion(_key, _converter)
# don't keep the loop variables in the module namespace
del _key, _converter


def terms_from_file(data_folder: str) -> Iterator[dict]:
    """
    Reads the obo file line by line to yield
    a dict for building an HPOTerm

    All lines above the first ``[Term]`` line are added to
    ``Metadata`` as header rows. ``[Typedef]`` sections are skipped.
    If the file does not contain any ``[Term]`` line, nothing is yielded.

    Parameters
    ----------
    data_folder:
        Path to the folder that contains the ``obo`` file.
        The name of the file itself is defined in ``FILENAME``

    Yields
    ------
    dict
        The result of ``parse_obo_section`` for every ``[Term]`` section
    """
    filename = os.path.join(data_folder, FILENAME)

    # Decode explicitly as UTF-8 so that the parsing result
    # does not depend on the default encoding of the platform
    with open(filename, encoding='utf-8') as fh:
        # everything above the first [Term] is header
        # and thus must not be parsed as term
        for line in fh:
            line = line.strip()
            if line == '[Term]':
                break
            Metadata.add_header_row(line)
        else:
            # The file was read completely without finding a [Term]:
            # there is no term to yield
            return

        term_section: List[str] = []
        # True while the current lines belong to a [Term] section,
        # False while they belong to a [Typedef] section
        in_term = True
        for line in fh:
            line = line.strip()
            if line in ('[Term]', '[Typedef]'):
                # A new section starts: the previous one is complete
                if in_term:
                    yield parse_obo_section(term_section)
                term_section = []
                in_term = line == '[Term]'
            elif in_term:
                term_section.append(line)
            # else: line of a Typedef section, which is not parsed

        # The last section is not followed by another section header
        if in_term:
            yield parse_obo_section(term_section)


def parse_obo_section(term_section: List[str]) -> dict:
    """
    Parses the section of an OBO file for one single HPO term

    Every line is split at the first ``:`` into key and value.
    The values of keys that occur multiple times are collected in a list.
    Afterwards the keys are renamed and the values are converted
    via ``_convert_dict_keys`` and ``_convert_value_types``.

    Empty lines and comment lines (starting with ``!``) are ignored.

    Parameters
    ----------
    term_section:
        Lines of the ``obo`` file that describe the HPO term

    Returns
    -------
    dict
        Attributes of the HPO term

    Raises
    ------
    ValueError
        If a line is neither empty, nor a comment
        and does not contain a ``:``
    """
    term_data: Dict[str, List[str]] = {}
    for line in term_section:
        if not line or line.startswith('!'):
            continue
        key, separator, value = line.partition(':')
        if not separator:
            # Same exception type as the unpacking error of a plain
            # ``split``, but it names the line that caused the problem
            raise ValueError(
                f'Invalid line in OBO term section: {line!r}'
            )
        term_data.setdefault(key, []).append(value.strip())

    # The key conversion must run first, because the type
    # conversions are registered for the converted key names
    return _convert_value_types(_convert_dict_keys(term_data))


def _convert_dict_keys(term_data: dict) -> dict:
    """
    Renames the attribute names of the HPO obo flat file
    to the names of the actual attributes of ``HPOTerm``

    The renaming is defined in ``Converter.key_conversion``.
    ``term_data`` is modified in place and returned. If the
    old name is not present, the new name is set to an empty list.
    """
    for obo_name, attribute_name in Converter.key_conversion.items():
        raw_values = term_data.pop(obo_name, [])
        term_data[attribute_name] = raw_values

    return term_data


def _convert_value_types(term_data: dict) -> dict:
    """
    Applies every converter that is registered in
    ``Converter.type_conversions`` to its attribute in ``term_data``

    ``term_data`` is modified in place and returned. Attributes that
    are missing in ``term_data`` are passed to the converter as empty
    list, so every registered attribute is present afterwards.
    """
    for attribute, converter in Converter.type_conversions.items():
        raw_values = term_data.get(attribute, [])
        # The converter also receives the (partially converted) term data
        term_data[attribute] = converter(raw_values, attribute, term_data)

    return term_data