Source code for pydna.seq

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
A subclass of the Biopython SeqRecord class.

Has a number of extra methods and uses
the :class:`pydna._pretty_str.pretty_str` class instread of str for a
nicer output in the IPython shell.
"""

# from pydna.codon import weights as _weights
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from pydna.codon import rare_codons as _rare_codons
from pydna.codon import start as _start
from pydna.codon import stop as _stop
from pydna.codon import n_end as _n_end
from seguid import lsseguid as _lsseguid
from pydna.utils import rc as _rc

from Bio.SeqUtils import seq3 as _seq3
from Bio.SeqUtils import gc_fraction as _GC
import re as _re
from Bio.Seq import Seq as _Seq
from pydna._pretty import PrettyTable as _PrettyTable

from typing import List as _List, Optional as _Optional, Tuple as _Tuple
import logging as _logging

# Module-level logger; its name is "pydna." prepended to this module's __name__.
_module_logger = _logging.getLogger("pydna." + __name__)


class Seq(_Seq):
    """Sequence class extending :class:`Bio.Seq.Seq`.

    Adds codon-table helpers (GC content, CAI, rare codons, start/stop codon
    lookups, an expression summary table), ORF searches and an lsseguid
    checksum. Slicing and reverse complement return instances of this class.
    """

    def translate(
        self,
        *args,
        stop_symbol: str = "*",
        to_stop: bool = False,
        cds: bool = False,
        gap: str = "-",
        **kwargs,
    ) -> "ProteinSeq":
        """Translate the sequence and return the result as a :class:`ProteinSeq`.

        All arguments are forwarded unchanged to the parent class ``translate``;
        only the type of the returned object differs.
        """
        p = super().translate(*args, stop_symbol=stop_symbol, to_stop=to_stop, cds=cds, gap=gap, **kwargs)
        return ProteinSeq(p._data)

    def gc(self) -> float:
        """Return the GC fraction of the upper-cased sequence, rounded to three decimals."""
        return round(_GC(self._data.upper().decode("ASCII")), 3)

    def cai(self, organism: str = "sce") -> float:
        """Return the result of :func:`pydna.utils.cai` for the upper-cased sequence.

        Parameters
        ----------
        organism : str
            Passed through as the ``organism`` keyword of ``pydna.utils.cai``.
            NOTE(review): presumably the codon adaptation index for that
            organism's codon table -- confirm against pydna.utils.
        """
        # Imported locally, as in the original code.
        from pydna.utils import cai as _cai

        return _cai(self._data.upper().decode("ASCII"), organism=organism)

    def rarecodons(self, organism: str = "sce") -> _List[slice]:
        """Return slices for every in-frame triplet listed in ``_rare_codons[organism]``.

        Triplets are read from position 0 in steps of three; a trailing
        partial triplet is ignored. Each slice is ``slice(start, start + 3, 1)``.
        """
        rare = _rare_codons[organism]
        s = self._data.upper().decode("ASCII")
        in_frame_end = len(s) - len(s) % 3
        return [slice(x, x + 3, 1) for x in range(0, in_frame_end, 3) if s[x : x + 3] in rare]

    def startcodon(self, organism: str = "sce") -> _Optional[float]:
        """Look up the first three letters (upper-cased) in ``_start[organism]``.

        Returns ``None`` when the triplet is not a key of that table.
        """
        return _start[organism].get(self._data.upper().decode("ASCII")[:3])

    def stopcodon(self, organism: str = "sce") -> _Optional[float]:
        """Look up the last three letters (upper-cased) in ``_stop[organism]``.

        Returns ``None`` when the triplet is not a key of that table.
        """
        return _stop[organism].get(self._data.upper().decode("ASCII")[-3:])

    def express(self, organism: str = "sce") -> _PrettyTable:
        """Return a one-row table summarising the sequence for ``organism``.

        Columns: ``cds`` (first and last three letters), ``len`` (length / 3),
        ``cai``, ``gc``, ``sta`` and ``stp`` (start/stop codon lookups),
        ``n-end`` (lookup of the second codon's amino acid in
        ``_n_end[organism]``), one count column per codon in
        ``_rare_codons[organism]`` and ``rare`` (rare codon count divided by
        the number of in-frame triplets, rounded to three decimals; 0.0 when
        the sequence holds no complete triplet).
        """
        rare_codons = _rare_codons[organism]
        s = self._data.upper().decode("ASCII")
        table = _PrettyTable(["cds", "len", "cai", "gc", "sta", "stp", "n-end"] + rare_codons + ["rare"])

        triplets = [s[i : i + 3] for i in range(0, len(s) - len(s) % 3, 3)]
        counts = [triplets.count(codon) for codon in rare_codons]

        row = [
            f"{s[:3]}...{s[-3:]}",
            len(self) / 3,
            self.cai(organism),
            self.gc(),
            # The organism must be forwarded, otherwise these two columns
            # would always be looked up in the default table.
            self.startcodon(organism),
            self.stopcodon(organism),
            _n_end[organism].get(_seq3(self[3:6].translate())),
        ]
        row.extend(counts)
        # Guard against sequences shorter than one codon.
        row.append(round(sum(counts) / len(triplets), 3) if triplets else 0.0)

        table.add_row(row)
        return table

    def orfs2(self, minsize: int = 30) -> _List["Seq"]:
        """Return subsequences matching an ATG ... stop pattern, longest first.

        The pattern is ``ATG``, then at least ``minsize`` further triplets
        (matched lazily), then ``TAG``, ``TAA`` or ``TGA``; matching is case
        insensitive. After each hit the search restarts one position past the
        start of that hit, so overlapping matches are reported as well.
        """
        orf = _re.compile(f"ATG(?:...){{{minsize},}}?(?:TAG|TAA|TGA)", flags=_re.IGNORECASE)
        s = self._data.decode("ASCII")
        matches: _List[slice] = []

        match = orf.search(s)
        while match:
            matches.append(slice(match.start(), match.end()))
            match = orf.search(s, pos=match.start() + 1)
        return sorted([self[sl] for sl in matches], key=len, reverse=True)

    def orfs(self, minsize: int = 100) -> _List[_Tuple[int, int]]:
        """Return ``(x, y)`` pairs from :func:`pydna.utils.three_frame_orfs`.

        The helper is called with ``limit=minsize``; the frame element of each
        of its results is dropped.
        NOTE(review): x and y are presumably ORF start/end coordinates --
        confirm against pydna.utils.three_frame_orfs.
        """
        dna = self._data.decode("ASCII")
        # Imported locally, as in the original code.
        from pydna.utils import three_frame_orfs

        return [(x, y) for frame, x, y in three_frame_orfs(dna, limit=minsize)]

    def seguid(self) -> str:
        """Url safe SEGUID [#]_ for the sequence.

        This checksum is the same as seguid but with base64.urlsafe
        encoding instead of the normal base64. This means that
        the characters + and / are replaced with - and _ so that
        the checksum can be part of a URL.

        Examples
        --------
        >>> from pydna.seq import Seq
        >>> a = Seq("aa")
        >>> a.seguid()
        'lsseguid=gBw0Jp907Tg_yX3jNgS4qQWttjU'

        References
        ----------
        .. [#] http://wiki.christophchamp.com/index.php/SEGUID
        """
        return _lsseguid(self._data.decode("utf8").upper(), alphabet="{DNA-extended}")

    def __getitem__(self, key):
        """Index or slice like the parent class, re-typing the result to this class."""
        result = super().__getitem__(key)
        try:
            result.__class__ = self.__class__
        except TypeError:
            # Results whose class cannot be reassigned are returned unchanged.
            pass
        return result

    def reverse_complement(self):
        """Return a new instance built from ``pydna.utils.rc`` applied to the sequence data."""
        return self.__class__(_rc(self._data))

    rc = reverse_complement


class ProteinSeq(_Seq):
    """Protein sequence class extending :class:`Bio.Seq.Seq`.

    Nucleotide-only operations inherited from the parent class are overridden
    to raise ``NotImplementedError``. Adds an lsseguid checksum and a few
    properties computed with ``Bio.SeqUtils.ProtParam.ProteinAnalysis``.
    """

    # The overrides below accept arbitrary arguments so that a call made with
    # the parent class' signature still raises NotImplementedError rather than
    # a TypeError about unexpected arguments.

    def translate(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    def complement(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    def complement_rna(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    def reverse_complement(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    rc = reverse_complement

    def reverse_complement_rna(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    def transcribe(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    def back_transcribe(self, *args, **kwargs):
        """Not defined for protein; always raises ``NotImplementedError``."""
        raise NotImplementedError("Not defined for protein.")

    def seguid(self) -> str:
        """Url safe SEGUID [#]_ for the sequence.

        This checksum is the same as seguid but with base64.urlsafe
        encoding instead of the normal base64. This means that
        the characters + and / are replaced with - and _ so that
        the checksum can be part of a URL.

        Examples
        --------
        >>> from pydna.seq import ProteinSeq
        >>> a = ProteinSeq("aa")
        >>> a.seguid()
        'lsseguid=gBw0Jp907Tg_yX3jNgS4qQWttjU'

        References
        ----------
        .. [#] http://wiki.christophchamp.com/index.php/SEGUID
        """
        return _lsseguid(self._data.decode("utf8").upper(), alphabet="{protein-extended}")

    def __getitem__(self, key):
        """Index or slice like the parent class, re-typing the result to this class."""
        result = super().__getitem__(key)
        try:
            result.__class__ = self.__class__
        except TypeError:
            # Results whose class cannot be reassigned are returned unchanged.
            pass
        return result

    def _pa(self) -> ProteinAnalysis:
        """Return a ``ProteinAnalysis`` object for the sequence decoded as ASCII."""
        return ProteinAnalysis(self._data.decode("ascii"))

    def molecular_weight(self) -> float:
        """Return ``ProteinAnalysis.molecular_weight()`` for this sequence."""
        return self._pa().molecular_weight()

    def pI(self) -> float:
        """Return ``ProteinAnalysis.isoelectric_point()`` for this sequence."""
        return self._pa().isoelectric_point()

    def instability_index(self) -> float:
        """
        Instability index according to Guruprasad et al.

        Value above 40 means the protein has a short half life.

        Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990).
        """
        return self._pa().instability_index()


if __name__ == "__main__":
    import os as _os

    # Run this module's doctests with the pydna_cached_funcs environment
    # variable blanked, then put the previous value back.
    cached = _os.getenv("pydna_cached_funcs", "")
    _os.environ["pydna_cached_funcs"] = ""
    import doctest

    try:
        doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS)
    finally:
        # Restore the variable even if the doctest run raises.
        _os.environ["pydna_cached_funcs"] = cached