Source code for pydna.seq

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Subclasses of the Biopython Seq class.

They have a number of extra methods and use
the :class:`pydna._pretty_str.pretty_str` class instead of str for a
nicer output in the IPython shell.
"""

# from pydna.codon import weights as _weights
from Bio.SeqUtils.ProtParam import ProteinAnalysis
from pydna.codon import rare_codons as _rare_codons
from pydna.codon import start as _start
from pydna.codon import stop as _stop
from pydna.codon import n_end as _n_end
from seguid import lsseguid as _lsseguid
from pydna.utils import rc as _rc

from Bio.SeqUtils import seq3 as _seq3
from Bio.SeqUtils import gc_fraction as _GC
import re as _re
from Bio.Seq import Seq as _Seq
from pydna._pretty import PrettyTable as _PrettyTable

from typing import List as _List, Optional as _Optional, Tuple as _Tuple
import logging as _logging

_module_logger = _logging.getLogger("pydna." + __name__)


class Seq(_Seq):
    """Nucleotide sequence built on the Biopython ``Seq`` class.

    Adds translation to :class:`ProteinSeq`, GC content, codon usage
    helpers, open reading frame searches and the lsSEGUID checksum.
    Slicing and reverse complementing return objects of the same class.
    """

    def _ascii_upper(self) -> str:
        """Return the sequence data as an upper case ASCII string."""
        return self._data.upper().decode("ASCII")

    def translate(
        self,
        *args,
        stop_symbol: str = "*",
        to_stop: bool = False,
        cds: bool = False,
        gap: str = "-",
        **kwargs,
    ) -> "ProteinSeq":
        """Translate the sequence and return the result as a ProteinSeq.

        All arguments are forwarded unchanged to the ``translate`` method
        of the parent class; only the type of the result is changed.
        """
        p = super().translate(
            *args,
            stop_symbol=stop_symbol,
            to_stop=to_stop,
            cds=cds,
            gap=gap,
            **kwargs,
        )
        return ProteinSeq(p._data)

    def gc(self) -> float:
        """Return GC content, rounded to three decimals."""
        return round(_GC(self._ascii_upper()), 3)

    def cai(self, organism: str = "sce") -> float:
        """Return the value of :func:`pydna.utils.cai` for the sequence.

        The sequence is passed in upper case together with ``organism``.
        """
        # Imported here rather than at module level, as in the original code
        # (presumably to avoid a circular import -- confirm before moving).
        from pydna.utils import cai as _cai

        return _cai(self._ascii_upper(), organism=organism)

    def rarecodons(self, organism: str = "sce") -> _List[slice]:
        """Return slices locating the rare codons in reading frame one.

        The sequence is read in consecutive triplets starting at index 0
        (a trailing partial triplet is ignored). Every triplet found in
        ``pydna.codon.rare_codons[organism]`` yields ``slice(x, x + 3, 1)``.
        """
        rare = _rare_codons[organism]
        s = self._ascii_upper()
        whole = 3 * (len(s) // 3)  # length covered by complete triplets
        return [slice(x, x + 3, 1) for x in range(0, whole, 3) if s[x : x + 3] in rare]

    def startcodon(self, organism: str = "sce") -> _Optional[float]:
        """Look up the first three letters in the organism's start codon table.

        Returns ``None`` when the triplet is not present in the table.
        """
        return _start[organism].get(self._ascii_upper()[:3])

    def stopcodon(self, organism: str = "sce") -> _Optional[float]:
        """Look up the last three letters in the organism's stop codon table.

        Returns ``None`` when the triplet is not present in the table.
        """
        return _stop[organism].get(self._ascii_upper()[-3:])

    def express(self, organism: str = "sce") -> _PrettyTable:
        """Return a one-row table summarising codon usage of the sequence.

        Columns
        -------
        cds
            First and last three letters of the sequence.
        len
            Sequence length divided by three.
        cai, gc
            Results of :meth:`cai` and :meth:`gc`.
        sta, stp
            Results of :meth:`startcodon` and :meth:`stopcodon` for
            ``organism``.
        n-end
            Entry of ``pydna.codon.n_end[organism]`` for the three letter
            code of the amino acid encoded by positions 3-6.
        <rare codons>
            One column per codon in ``pydna.codon.rare_codons[organism]``
            holding the number of in-frame occurrences.
        rare
            Total rare codon count divided by the number of complete
            triplets (0.0 when the sequence holds no complete triplet).
        """
        rare = _rare_codons[organism]
        s = self._ascii_upper()
        table = _PrettyTable(["cds", "len", "cai", "gc", "sta", "stp", "n-end"] + rare + ["rare"])

        codons = [s[i : i + 3] for i in range(0, 3 * (len(s) // 3), 3)]
        counts = [codons.count(codon) for codon in rare]

        row = [
            f"{s[:3]}...{s[-3:]}",
            len(self) / 3,
            self.cai(organism),
            self.gc(),
            # The requested organism must be forwarded; otherwise the
            # defaults of startcodon/stopcodon ("sce") are silently used.
            self.startcodon(organism),
            self.stopcodon(organism),
            _n_end[organism].get(_seq3(self[3:6].translate())),
            *counts,
            # Guard against sequences shorter than one codon.
            round(sum(counts) / len(codons), 3) if codons else 0.0,
        ]
        table.add_row(row)
        return table

    def orfs2(self, minsize: int = 30) -> _List["Seq"]:
        """Return regex-detected open reading frames, longest first.

        A hit is ``ATG`` followed by at least ``minsize`` triplets and then
        the first following in-frame ``TAG``, ``TAA`` or ``TGA`` (case
        insensitive). The search is restarted one position after the start
        of each hit, so overlapping hits are reported as well. Each hit is
        returned as a slice of this sequence.
        """
        orf = _re.compile(f"ATG(?:...){{{minsize},}}?(?:TAG|TAA|TGA)", flags=_re.IGNORECASE)
        s = self._data.decode("ASCII")
        hits: _List[slice] = []
        match = orf.search(s, pos=0)
        while match:
            hits.append(slice(match.start(), match.end()))
            match = orf.search(s, pos=match.start() + 1)
        return sorted([self[sl] for sl in hits], key=len, reverse=True)

    def orfs(self, minsize: int = 100) -> _List[_Tuple[int, int]]:
        """Return ``(x, y)`` pairs from :func:`pydna.utils.three_frame_orfs`.

        The sequence is passed as a string with ``limit=minsize``; the frame
        reported by the helper is dropped. ``x`` and ``y`` are presumably the
        start and end positions of each open reading frame -- see the helper
        for the exact convention.
        """
        from pydna.utils import three_frame_orfs

        dna = self._data.decode("ASCII")
        return [(x, y) for frame, x, y in three_frame_orfs(dna, limit=minsize)]

    def seguid(self) -> str:
        """Url safe SEGUID [#]_ for the sequence.

        This checksum is the same as seguid but with base64.urlsafe
        encoding instead of the normal base64. This means that
        the characters + and / are replaced with - and _ so that
        the checksum can be part of a URL.

        Examples
        --------
        >>> from pydna.seq import Seq
        >>> a = Seq("aa")
        >>> a.seguid()
        'lsseguid=gBw0Jp907Tg_yX3jNgS4qQWttjU'

        References
        ----------
        .. [#] http://wiki.christophchamp.com/index.php/SEGUID
        """
        return _lsseguid(self._data.decode("utf8").upper(), alphabet="{DNA-extended}")

    def __getitem__(self, key):
        """Index or slice the sequence, keeping the class of ``self``."""
        result = super().__getitem__(key)
        try:
            result.__class__ = self.__class__
        except TypeError:
            # The result does not allow its class to be reassigned (for
            # example a plain str); hand it back unchanged.
            pass
        return result

    def reverse_complement(self):
        """Return the reverse complement as a new object of the same class."""
        return self.__class__(_rc(self._data))

    # Short alias.
    rc = reverse_complement
class ProteinSeq(_Seq):
    """Protein sequence built on the Biopython ``Seq`` class.

    Nucleotide-only operations inherited from the parent class are disabled
    and raise ``NotImplementedError``. They accept (and ignore) any arguments
    so that calls written for the parent class signatures fail with that
    error rather than with a ``TypeError``.
    """

    def translate(self, *args, **kwargs):
        """Not defined for protein; always raises NotImplementedError."""
        raise NotImplementedError("Not defined for protein.")

    def complement(self, *args, **kwargs):
        """Not defined for protein; always raises NotImplementedError."""
        raise NotImplementedError("Not defined for protein.")

    def complement_rna(self, *args, **kwargs):
        """Not defined for protein; always raises NotImplementedError."""
        raise NotImplementedError("Not defined for protein.")

    def reverse_complement(self, *args, **kwargs):
        """Not defined for protein; always raises NotImplementedError."""
        raise NotImplementedError("Not defined for protein.")

    # Short alias, disabled in the same way.
    rc = reverse_complement

    def reverse_complement_rna(self, *args, **kwargs):
        """Not defined for protein; always raises NotImplementedError."""
        raise NotImplementedError("Not defined for protein.")

    def transcribe(self, *args, **kwargs):
        """Not defined for protein; always raises NotImplementedError."""
        raise NotImplementedError("Not defined for protein.")

    def back_transcribe(self, *args, **kwargs):
        """Not defined for protein; always raises NotImplementedError."""
        raise NotImplementedError("Not defined for protein.")

    def seguid(self) -> str:
        """Url safe SEGUID [#]_ for the sequence.

        This checksum is the same as seguid but with base64.urlsafe
        encoding instead of the normal base64. This means that
        the characters + and / are replaced with - and _ so that
        the checksum can be part of a URL.

        Examples
        --------
        >>> from pydna.seq import ProteinSeq
        >>> a = ProteinSeq("aa")
        >>> a.seguid()
        'lsseguid=gBw0Jp907Tg_yX3jNgS4qQWttjU'

        References
        ----------
        .. [#] http://wiki.christophchamp.com/index.php/SEGUID
        """
        return _lsseguid(self._data.decode("utf8").upper(), alphabet="{protein-extended}")

    def __getitem__(self, key):
        """Index or slice the sequence, keeping the class of ``self``."""
        result = super().__getitem__(key)
        try:
            result.__class__ = self.__class__
        except TypeError:
            # The result does not allow its class to be reassigned (for
            # example a plain str); hand it back unchanged.
            pass
        return result

    def _pa(self) -> ProteinAnalysis:
        """Return a Biopython ``ProteinAnalysis`` object for the sequence."""
        return ProteinAnalysis(self._data.decode("ascii"))

    def molecular_weight(self) -> float:
        """Return the molecular weight computed by ``ProteinAnalysis``."""
        return self._pa().molecular_weight()

    def pI(self) -> float:
        """Return the isoelectric point computed by ``ProteinAnalysis``."""
        return self._pa().isoelectric_point()

    def instability_index(self) -> float:
        """
        Instability index according to Guruprasad et al.

        Value above 40 means the protein has a short half life.

        Guruprasad K., Reddy B.V.B., Pandit M.W. Protein Engineering 4:155-161(1990).
        """
        return self._pa().instability_index()
if __name__ == "__main__":
    import os as _os
    import doctest

    # Run the doctests with the pydna_cached_funcs environment variable
    # blanked out (presumably this switches off pydna's function caching --
    # confirm), and put the previous value back afterwards.
    cached = _os.getenv("pydna_cached_funcs", "")
    _os.environ["pydna_cached_funcs"] = ""
    try:
        doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS)
    finally:
        # Restore the variable even if the doctest run raises.
        _os.environ["pydna_cached_funcs"] = cached