Source code for pyhpo.parser.obo

"""
Parse the OBO flat-file
"""
import os
from typing import Callable, Dict, Iterator, List

from pyhpo.config import TRUTH


# Name of the OBO file that ``terms_from_file`` opens inside the data folder
FILENAME = 'hp.obo'


class Metadata:
    """
    Class-level store for the header information of the OBO file

    All state lives on the class itself, so it is shared by every
    user of the class and persists between calls.
    """
    # Annotated only; no value is assigned within this class
    format_version: str
    data_version: str
    # Collected header rows; one list object shared through the class
    header: List[str] = []

    @classmethod
    def add_header_row(cls, row: str) -> None:
        """
        Stores one more row at the end of the shared ``header`` list

        Parameters
        ----------
        row: str
            The header line to remember
        """
        cls.header.extend((row,))


class Converter:
    """
    Registry of key renames and value converters for parsed OBO terms

    Every converter function is called with three arguments:

    * ``value``: the list of raw string values collected for one key
    * ``key``: the name of that key
    * ``values``: the complete term data the value was taken from
    """
    # OBO attribute name -> attribute name used in the resulting term dict
    key_conversion: Dict[str, str] = {
        'def': 'definition'
    }

    # Attribute name -> function that converts the raw list of values
    type_conversions: Dict[str, Callable] = {}

    @classmethod
    def add_type_conversion(cls, key: str, func: Callable) -> None:
        """
        Registers ``func`` as the converter for the attribute ``key``

        An already registered converter for the same key is replaced.

        Parameters
        ----------
        key: str
            Name of the attribute in the term dict
        func: Callable
            Converter, called as ``func(value, key, values)``
        """
        cls.type_conversions[key] = func

    @staticmethod
    def array_to_str(
        value: List[str],
        key: str,
        values: dict
    ) -> str:
        """
        Reduces a list of strings to its first item

        Parameters
        ----------
        value: list of str
            Raw values of the attribute
        key: str
            Name of the attribute (not used)
        values: dict
            Complete term data (not used)

        Returns
        -------
        str
            The first item of ``value`` or an empty string
            if ``value`` is empty
        """
        return value[0] if value else ''

    @staticmethod
    def array_to_bool(
        value: List[str],
        key: str,
        values: dict
    ) -> bool:
        """
        Converts the first item of a list of strings to a boolean

        Parameters
        ----------
        value: list of str
            Raw values of the attribute
        key: str
            Name of the attribute (not used)
        values: dict
            Complete term data (not used)

        Returns
        -------
        bool
            ``False`` if ``value`` is empty, otherwise whether the
            lower-cased first item is contained in ``TRUTH``
        """
        if not value:
            return False
        return value[0].lower() in TRUTH

    @staticmethod
    def parse_synonym(
        value: List[str],
        key: str,
        values: dict
    ) -> List[str]:
        """
        Extracts the synonyms from the synonym data lines
        in the obo file format

        The synonym is the text between the first and the second
        double quote of each line. Escaped quotes inside the synonym
        text are not treated specially.

        Parameters
        ----------
        value: list of str
            value parts of the synonym-data lines of the obo file

            e.g: ['"Multicystic dysplastic kidney" EXACT []']
        key: str
            Name of the attribute (not used)
        values: dict
            Complete term data (not used)

        Returns
        -------
        list of str
            Actual synonym titles

            e.g.: ['Multicystic dysplastic kidney']

        Raises
        ------
        IndexError
            If a line does not contain any double quote
        """
        return [line.split('"')[1] for line in value]


# Register the converter of every attribute whose raw list of values
# has to be turned into its final type (same order as listed here).
for _attribute, _converter in (
    ('id', Converter.array_to_str),
    ('name', Converter.array_to_str),
    ('comment', Converter.array_to_str),
    ('definition', Converter.array_to_str),
    ('is_obsolete', Converter.array_to_bool),
    ('replaced_by', Converter.array_to_str),
    ('synonym', Converter.parse_synonym),
):
    Converter.add_type_conversion(_attribute, _converter)
# Do not leave the loop variables behind in the module namespace
del _attribute, _converter


def terms_from_file(data_folder: str) -> Iterator[dict]:
    """
    Reads an obo file line by line to yield a dict for building an HPOTerm

    All lines above the first ``[Term]`` stanza are stored as header
    rows in ``Metadata``. Parsing stops at the first ``[Typedef]``
    stanza.

    Parameters
    ----------
    data_folder:
        Path to the folder that contains the ``hp.obo`` file

    Yields
    ------
    dict
        The parsed data of one ``[Term]`` stanza, as returned
        by ``parse_obo_section``. Stanzas without any content
        are skipped.
    """
    filename = os.path.join(data_folder, FILENAME)
    # Decode explicitly as UTF-8 instead of relying on the platform's
    # default encoding (assumes the OBO file is UTF-8 encoded - confirm
    # if other encodings must be supported)
    with open(filename, encoding='utf-8') as fh:
        # everything above the first [Term] is header
        # and thus must not be parsed as term
        for line in fh:
            line = line.strip()
            if line == '[Term]':
                break
            Metadata.add_header_row(line)

        term_section: List[str] = []
        for line in fh:
            line = line.strip()
            if line == '[Term]':
                # ``any`` ignores sections that consist of blank lines only
                if any(term_section):
                    yield parse_obo_section(term_section)
                term_section = []
            elif line == '[Typedef]':
                # we're currently not parsing an Typedef section.
                # Since they only appear at the end of the OBO file
                # we're stopping the parsing here.
                # TODO: Instead of break, add logic to skip all Typedef
                # sections and continue with term parsing
                break
            else:
                term_section.append(line)

        # The last term is not followed by another [Term] line.
        # A file without any term content must not yield an empty term.
        if any(term_section):
            yield parse_obo_section(term_section)


def parse_obo_section(term_section: List[str]) -> dict:
    """
    Parses the section of an OBO file for one single HPO term

    Every line is split at the first ``:`` into key and value.
    The values of a key that occurs multiple times are collected
    in a list, in the order of their occurrence. Afterwards the keys
    are renamed and the values are converted to their final types.

    Parameters
    ----------
    term_section:
        Lines of the ``obo`` file that describe the HPO term

    Returns
    -------
    dict
        The data of the term with converted keys and value types

    Raises
    ------
    ValueError
        If a non-empty, non-comment line does not contain a ``:``
    """
    term_data: Dict[str, List[str]] = {}
    for line in term_section:
        # Blank lines and full-line OBO comments (starting with ``!``)
        # do not carry a key-value pair
        if not line or line.startswith('!'):
            continue
        key, separator, value = line.partition(':')
        if not separator:
            raise ValueError(
                f'Invalid line in OBO term section (missing ":"): {line!r}'
            )
        term_data.setdefault(key, []).append(value.strip())

    # Both helpers modify ``term_data`` in place and return it
    _convert_dict_keys(term_data)
    return _convert_value_types(term_data)


def _convert_dict_keys(term_data: dict) -> dict:
    """
    Renames the keys of the parsed term data according to
    ``Converter.key_conversion``

    The HPO obo flat file contains some unfortunate attribute names
    which are replaced by the actual attribute names for ``HPOTerm``.
    A key that is missing in ``term_data`` is added under its new name
    with an empty list. ``term_data`` is modified in place.

    Parameters
    ----------
    term_data:
        The parsed data of one term

    Returns
    -------
    dict
        The same, modified ``term_data`` object
    """
    for obo_name, attribute_name in Converter.key_conversion.items():
        raw_values = term_data.pop(obo_name, [])
        term_data[attribute_name] = raw_values
    return term_data


def _convert_value_types(term_data: dict) -> dict:
    """
    Applies every converter registered in ``Converter.type_conversions``
    to the parsed term data

    Each converter receives the current value of its key (an empty list
    if the key is missing), the key itself and the whole ``term_data``.
    Its result replaces the value. ``term_data`` is modified in place.

    Parameters
    ----------
    term_data:
        The parsed data of one term

    Returns
    -------
    dict
        The same, modified ``term_data`` object
    """
    for attribute, converter in Converter.type_conversions.items():
        raw_values = term_data.get(attribute, [])
        term_data[attribute] = converter(raw_values, attribute, term_data)
    return term_data