Source code for pyhpo.matrix

from typing import Iterable, Any, Optional, List, Union, Tuple


class Matrix:
    """
    Poor man's implementation of a DataFrame/Matrix

    This is used to calculate similarities between HPO sets
    and is surprisingly much faster than using pandas DataFrames

    .. note::

       Pandas::

          ===== COMPARING SETS ======
          23806489 function calls (23770661 primitive calls) in 19.705 seconds
          ncalls  tottime  percall  cumtime  percall filename:lineno(function)
          ....
          9900    0.267    0.000   19.106    0.002 set.py:318(similarity)
          9900    1.124    0.000   14.330    0.001 set.py:477(_sim_score)
          ....

       Matrix::

          ===== COMPARING SETS ======
          12870433 function calls in 6.642 seconds
          ncalls  tottime  percall  cumtime  percall filename:lineno(function)
          ....
          9900    0.048    0.000    6.424    0.001 set.py:316(similarity)
          9900    0.928    0.000    5.112    0.001 set.py:432(_sim_score)
          ....

    .. warning::

       This `Matrix` should not be used as a public interface.
       It's only used internally for calculations.

    The values are kept in one flat, row-major list. Items are addressed
    with a ``(row, column)`` tuple:

    * ``matrix[r, c]`` is a single cell
    * ``matrix[r, None]`` is a whole row (as a list)
    * ``matrix[None, c]`` is a whole column (as a list)

    Negative indices count from the end, like for Python sequences.
    Indices outside of the Matrix raise a ``RuntimeError``.

    Parameters
    ----------
    rows: int
        The number of rows in the Matrix
    cols: int
        The number of columns in the Matrix
    data: list of values, default ``None``
        A list with values to fill the Matrix.
        It must contain exactly ``rows * cols`` items in row-major order.
        The list is stored as-is (it is not copied), so writing to the
        Matrix also modifies the list that was passed in.
        If ``None``, every cell is initialized with ``None``.

    Raises
    ------
    RuntimeError
        If ``data`` does not contain ``rows * cols`` items

    Attributes
    ----------
    n_rows: int
        The number of rows in the Matrix

    n_cols: int
        The number of columns in the Matrix

    rows: iterator
        Iterator over all rows

        **Example:** ::

            print(matrix)
            >>    ||   0|   1|   2|   3|
            >> =========================
            >> 0  ||  11|  12|  13|  14|
            >> 1  ||  21|  22|  23|  24|
            >> 2  ||  31|  32|  33|  34|

            for row in matrix.rows:
                print(row)

            >> [11, 12, 13, 14]
            >> [21, 22, 23, 24]
            >> [31, 32, 33, 34]

    columns: iterator
        Iterator over all columns

        **Example:** ::

            print(matrix)
            >>    ||   0|   1|   2|   3|
            >> =========================
            >> 0  ||  11|  12|  13|  14|
            >> 1  ||  21|  22|  23|  24|
            >> 2  ||  31|  32|  33|  34|

            for col in matrix.columns:
                print(col)

            >> [11, 21, 31]
            >> [12, 22, 32]
            >> [13, 23, 33]
            >> [14, 24, 34]

    """

    def __init__(
        self,
        rows: int,
        cols: int,
        data: Optional[List[Any]] = None
    ):
        self.n_rows = rows
        self.n_cols = cols
        self._data: List[Any]
        if data is None:
            self._data = [None] * rows * cols
        elif len(data) == rows * cols:
            # Deliberately no copy: the caller's list becomes the storage
            self._data = data
        else:
            raise RuntimeError('Wrong number of data items in `data`')

    def __getitem__(self, key: Tuple[Optional[int], Optional[int]]) -> Any:
        """
        Return a single cell, a whole row or a whole column

        ``key`` is a ``(row, column)`` tuple, ``None`` selects the
        complete axis. Rows and columns are returned as new lists.
        """
        return self._data[self._get_key_indicies(key)]

    def __setitem__(
        self,
        key: Tuple[Optional[int], Optional[int]],
        val: Any
    ) -> None:
        """
        Set a single cell, a whole row or a whole column

        Raises
        ------
        ValueError
            If a row or column is assigned and ``val`` does not have
            the same number of items as that row or column
        """
        idx = self._get_key_indicies(key)
        # A row is a step-1 slice: without the length check, assigning
        # a shorter/longer list would silently resize the flat storage
        if isinstance(idx, int) or len(self._data[idx]) == len(val):
            self._data[idx] = val
        else:
            raise ValueError('Different length of matrix subset and values')

    @staticmethod
    def _resolve_index(value: int, size: int, label: str) -> int:
        """
        Validate one axis index and translate negative values

        Returns the equivalent index in ``range(size)`` or raises
        ``RuntimeError`` if ``value`` is outside ``[-size, size)``.
        """
        if not -size <= value < size:
            raise RuntimeError(
                'Invalid {} number: {}'.format(label, value)
            )
        return value + size if value < 0 else value

    def _get_key_indicies(
        self,
        key: Tuple[Optional[int], Optional[int]]
    ) -> Union[int, slice]:
        """
        Translate a ``(row, column)`` key into an index of the flat list

        Returns
        -------
        int or slice
            An ``int`` for a single cell, a ``slice`` for a whole
            row or a whole column

        Raises
        ------
        RuntimeError
            If an index is out of range or if ``key`` is not one of
            ``(int, int)``, ``(int, None)`` or ``(None, int)``
        """
        row = key[0]
        col = key[1]

        if row is None and isinstance(col, int):
            # Return one column: every ``n_cols``-th item, starting at
            # the column's offset. Without the range check an invalid
            # offset would silently select cells of another column.
            col = self._resolve_index(col, self.n_cols, 'column')
            return slice(col, self.n_rows * self.n_cols, self.n_cols)

        if col is None and isinstance(row, int):
            # Return one row: ``n_cols`` consecutive items
            row = self._resolve_index(row, self.n_rows, 'row')
            start = row * self.n_cols
            return slice(start, start + self.n_cols, 1)

        if isinstance(row, int) and isinstance(col, int):
            row = self._resolve_index(row, self.n_rows, 'row')
            col = self._resolve_index(col, self.n_cols, 'column')
            return row * self.n_cols + col

        raise RuntimeError('Invalid arguments for Matrix subset')

    @property
    def rows(self) -> Iterable[Any]:
        """Iterate over all rows, each row is yielded as a list"""
        for x in range(self.n_rows):
            yield self[x, None]

    @property
    def columns(self) -> Iterable[Any]:
        """Iterate over all columns, each column is yielded as a list"""
        for x in range(self.n_cols):
            yield self[None, x]

    def __str__(self) -> str:
        """
        Render the Matrix as a text table with row and column numbers
        """
        # Cells are right-aligned to the widest value or column number
        cell_width = max(
            [len(str(self.n_cols))] + [len(str(x)) for x in self._data]
        ) + 2
        idx_width = len(str(self.n_rows)) + 2

        header = '{}||'.format(''.rjust(idx_width)) + ''.join(
            '{}|'.format(str(x).rjust(cell_width))
            for x in range(self.n_cols)
        )

        # The separator line is exactly as wide as the header line
        parts = [header, '\n', '=' * len(header)]
        for idx, item in enumerate(self._data):
            if idx % self.n_cols == 0:
                # First cell of a row: start a new line with the row number
                parts.append('\n{}||'.format(
                    str(idx // self.n_cols).ljust(idx_width)
                ))
            parts.append('{}|'.format(str(item).rjust(cell_width)))
        return ''.join(parts)