Source code for pydna.dseq

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2013-2023 by Björn Johansson.  All rights reserved.
# This code is part of the Python-dna distribution and governed by its
# license.  Please see the LICENSE.txt file that should have been included
# as part of this package.
"""Provides the Dseq class for handling double stranded DNA sequences.

Dseq is a subclass of :class:`Bio.Seq.Seq`. The Dseq class
is mostly useful as a part of the :class:`pydna.dseqrecord.Dseqrecord` class
which can hold more meta data.

The Dseq class support the notion of circular and linear DNA topology.
"""


import copy as _copy
import itertools as _itertools
import re as _re
import sys as _sys
import math as _math

from pydna.seq import Seq as _Seq
from Bio.Seq import _translate_str, _SeqAbstractBaseClass

from pydna._pretty import pretty_str as _pretty_str
from seguid import ldseguid as _ldseguid
from seguid import cdseguid as _cdseguid

from pydna.utils import rc as _rc
from pydna.utils import flatten as _flatten
from pydna.utils import cuts_overlap as _cuts_overlap

from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
from Bio.Restriction import RestrictionBatch as _RestrictionBatch
from Bio.Restriction import CommOnly

from typing import (
    TYPE_CHECKING,
    List as _List,
    Tuple as _Tuple,
    Union as _Union,
    TypeVar as _TypeVar,
    Iterable as _Iterable,
)

if TYPE_CHECKING:
    from Bio.Restriction import AbstractCut as _AbstractCut


# To represent any subclass of Dseq
DseqType = _TypeVar("DseqType", bound="Dseq")
EnzymesType = _TypeVar("EnzymesType", _RestrictionBatch, _Iterable["_AbstractCut"], "_AbstractCut")
CutSiteType = _Tuple[_Tuple[int, int], _Union["_AbstractCut", None]]


[docs]class Dseq(_Seq): """Dseq holds information for a double stranded DNA fragment. Dseq also holds information describing the topology of the DNA fragment (linear or circular). Parameters ---------- watson : str a string representing the watson (sense) DNA strand. crick : str, optional a string representing the crick (antisense) DNA strand. ovhg : int, optional A positive or negative number to describe the stagger between the watson and crick strands. see below for a detailed explanation. linear : bool, optional True indicates that sequence is linear, False that it is circular. circular : bool, optional True indicates that sequence is circular, False that it is linear. Examples -------- Dseq is a subclass of the Biopython Seq object. It stores two strings representing the watson (sense) and crick(antisense) strands. two properties called linear and circular, and a numeric value ovhg (overhang) describing the stagger for the watson and crick strand in the 5' end of the fragment. The most common usage is probably to create a Dseq object as a part of a Dseqrecord object (see :class:`pydna.dseqrecord.Dseqrecord`). There are three ways of creating a Dseq object directly listed below, but you can also use the function Dseq.from_full_sequence_and_overhangs() to create a Dseq: Only one argument (string): >>> from pydna.dseq import Dseq >>> Dseq("aaa") Dseq(-3) aaa ttt The given string will be interpreted as the watson strand of a blunt, linear double stranded sequence object. The crick strand is created automatically from the watson strand. Two arguments (string, string): >>> from pydna.dseq import Dseq >>> Dseq("gggaaat","ttt") Dseq(-7) gggaaat ttt If both watson and crick are given, but not ovhg an attempt will be made to find the best annealing between the strands. There are limitations to this. For long fragments it is quite slow. The length of the annealing sequences have to be at least half the length of the shortest of the strands. Three arguments (string, string, ovhg=int): The ovhg parameter is an integer describing the length of the crick strand overhang in the 5' end of the molecule. The ovhg parameter controls the stagger at the five prime end:: dsDNA overhang nnn... 2 nnnnn... nnnn... 1 nnnnn... nnnnn... 0 nnnnn... nnnnn... -1 nnnn... nnnnn... -2 nnn... Example of creating Dseq objects with different amounts of stagger: >>> Dseq(watson="agt", crick="actta", ovhg=-2) Dseq(-7) agt attca >>> Dseq(watson="agt",crick="actta",ovhg=-1) Dseq(-6) agt attca >>> Dseq(watson="agt",crick="actta",ovhg=0) Dseq(-5) agt attca >>> Dseq(watson="agt",crick="actta",ovhg=1) Dseq(-5) agt attca >>> Dseq(watson="agt",crick="actta",ovhg=2) Dseq(-5) agt attca If the ovhg parameter is specified a crick strand also needs to be supplied, otherwise an exception is raised. >>> Dseq(watson="agt", ovhg=2) Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/usr/local/lib/python2.7/dist-packages/pydna_/dsdna.py", line 169, in __init__ else: ValueError: ovhg defined without crick strand! The shape of the fragment is set by circular = True, False Note that both ends of the DNA fragment has to be compatible to set circular = True. >>> Dseq("aaa","ttt") Dseq(-3) aaa ttt >>> Dseq("aaa","ttt",ovhg=0) Dseq(-3) aaa ttt >>> Dseq("aaa","ttt",ovhg=1) Dseq(-4) aaa ttt >>> Dseq("aaa","ttt",ovhg=-1) Dseq(-4) aaa ttt >>> Dseq("aaa", "ttt", circular = True , ovhg=0) Dseq(o3) aaa ttt >>> a=Dseq("tttcccc","aaacccc") >>> a Dseq(-11) tttcccc ccccaaa >>> a.ovhg 4 >>> b=Dseq("ccccttt","ccccaaa") >>> b Dseq(-11) ccccttt aaacccc >>> b.ovhg -4 >>> Coercing to string >>> str(a) 'ggggtttcccc' A Dseq object can be longer that either the watson or crick strands. :: <-- length --> GATCCTTT AAAGCCTAG <-- length --> GATCCTTT AAAGCCCTA The slicing of a linear Dseq object works mostly as it does for a string. >>> s="ggatcc" >>> s[2:3] 'a' >>> s[2:4] 'at' >>> s[2:4:-1] '' >>> s[::2] 'gac' >>> from pydna.dseq import Dseq >>> d=Dseq(s, circular=False) >>> d[2:3] Dseq(-1) a t >>> d[2:4] Dseq(-2) at ta >>> d[2:4:-1] Dseq(-0) <BLANKLINE> <BLANKLINE> >>> d[::2] Dseq(-3) gac ctg The slicing of a circular Dseq object has a slightly different meaning. >>> s="ggAtCc" >>> d=Dseq(s, circular=True) >>> d Dseq(o6) ggAtCc ccTaGg >>> d[4:3] Dseq(-5) CcggA GgccT The slice [X:X] produces an empty slice for a string, while this will return the linearized sequence starting at X: >>> s="ggatcc" >>> d=Dseq(s, circular=True) >>> d Dseq(o6) ggatcc cctagg >>> d[3:3] Dseq(-6) tccgga aggcct >>> See Also -------- pydna.dseqrecord.Dseqrecord """ trunc = 30 def __init__( self, watson: _Union[str, bytes], crick: _Union[str, bytes, None] = None, ovhg=None, circular=False, pos=0, ): if isinstance(watson, bytes): watson = watson.decode("ASCII") if isinstance(crick, bytes): crick = crick.decode("ASCII") if crick is None: if ovhg is not None: raise ValueError("ovhg defined without crick strand!") crick = _rc(watson) ovhg = 0 self._data = bytes(watson, encoding="ASCII") else: # crick strand given if ovhg is None: # ovhg not given olaps = _common_sub_strings( str(watson).lower(), str(_rc(crick).lower()), int(_math.log(len(watson)) / _math.log(4)), ) if len(olaps) == 0: raise ValueError("Could not anneal the two strands." " Please provide ovhg value") # We extract the positions and length of the first (longest) overlap, since # common_sub_strings sorts the overlaps by length. pos_watson, pos_crick, longest_olap_length = olaps[0] # We see if there is another overlap of the same length if any(olap[2] >= longest_olap_length for olap in olaps[1:]): raise ValueError("More than one way of annealing the" " strands. Please provide ovhg value") ovhg = pos_crick - pos_watson sns = (ovhg * " ") + _pretty_str(watson) asn = (-ovhg * " ") + _pretty_str(_rc(crick)) self._data = bytes( "".join([a.strip() or b.strip() for a, b in _itertools.zip_longest(sns, asn, fillvalue=" ")]), encoding="ASCII", ) else: # ovhg given if ovhg == 0: if len(watson) >= len(crick): self._data = bytes(watson, encoding="ASCII") else: self._data = bytes( watson + _rc(crick[: len(crick) - len(watson)]), encoding="ASCII", ) elif ovhg > 0: if ovhg + len(watson) > len(crick): self._data = bytes(_rc(crick[-ovhg:]) + watson, encoding="ASCII") else: self._data = bytes( _rc(crick[-ovhg:]) + watson + _rc(crick[: len(crick) - ovhg - len(watson)]), encoding="ASCII", ) else: # ovhg < 0 if -ovhg + len(crick) > len(watson): self._data = bytes( watson + _rc(crick[: -ovhg + len(crick) - len(watson)]), encoding="ASCII", ) else: self._data = bytes(watson, encoding="ASCII") self.circular = circular self.watson = _pretty_str(watson) self.crick = _pretty_str(crick) self.length = len(self._data) self.ovhg = ovhg self.pos = pos
[docs] @classmethod def quick( cls, watson: str, crick: str, ovhg=0, circular=False, pos=0, ): obj = cls.__new__(cls) # Does not call __init__ obj.watson = _pretty_str(watson) obj.crick = _pretty_str(crick) obj.ovhg = ovhg obj.circular = circular obj.length = max(len(watson) + max(0, ovhg), len(crick) + max(0, -ovhg)) obj.pos = pos wb = bytes(watson, encoding="ASCII") cb = bytes(crick, encoding="ASCII") obj._data = _rc(cb[-max(0, ovhg) or len(cb) :]) + wb + _rc(cb[: max(0, len(cb) - ovhg - len(wb))]) return obj
[docs] @classmethod def from_string( cls, dna: str, *args, # linear=True, circular=False, **kwargs, ): obj = cls.__new__(cls) # Does not call __init__ obj.watson = _pretty_str(dna) obj.crick = _pretty_str(_rc(dna)) obj.ovhg = 0 obj.circular = circular # obj._linear = linear obj.length = len(dna) obj.pos = 0 obj._data = bytes(dna, encoding="ASCII") return obj
[docs] @classmethod def from_representation(cls, dsdna: str, *args, **kwargs): obj = cls.__new__(cls) # Does not call __init__ w, c, *r = [ln for ln in dsdna.splitlines() if ln] ovhg = obj.ovhg = len(w) - len(w.lstrip()) - (len(c) - len(c.lstrip())) watson = obj.watson = _pretty_str(w.strip()) crick = obj.crick = _pretty_str(c.strip()[::-1]) obj.circular = False # obj._linear = True obj.length = max(len(watson) + max(0, ovhg), len(crick) + max(0, -ovhg)) obj.pos = 0 wb = bytes(watson, encoding="ASCII") cb = bytes(crick, encoding="ASCII") obj._data = _rc(cb[-max(0, ovhg) or len(cb) :]) + wb + _rc(cb[: max(0, len(cb) - ovhg - len(wb))]) return obj
[docs] @classmethod def from_full_sequence_and_overhangs(cls, full_sequence: str, crick_ovhg: int, watson_ovhg: int): """Create a linear Dseq object from a full sequence and the 3' overhangs of each strand. The order of the parameters is like this because the 3' overhang of the crick strand is the one on the left side of the sequence. Parameters ---------- full_sequence: str The full sequence of the Dseq object. crick_ovhg: int The overhang of the crick strand in the 3' end. Equivalent to Dseq.ovhg. watson_ovhg: int The overhang of the watson strand in the 5' end. Returns ------- Dseq A Dseq object. Examples -------- >>> Dseq.from_full_sequence_and_overhangs('AAAAAA', crick_ovhg=2, watson_ovhg=2) Dseq(-6) AAAA TTTT >>> Dseq.from_full_sequence_and_overhangs('AAAAAA', crick_ovhg=-2, watson_ovhg=2) Dseq(-6) AAAAAA TT >>> Dseq.from_full_sequence_and_overhangs('AAAAAA', crick_ovhg=2, watson_ovhg=-2) Dseq(-6) AA TTTTTT >>> Dseq.from_full_sequence_and_overhangs('AAAAAA', crick_ovhg=-2, watson_ovhg=-2) Dseq(-6) AAAA TTTT """ full_sequence_rev = str(Dseq(full_sequence).reverse_complement()) watson = full_sequence crick = full_sequence_rev # If necessary, we trim the left side if crick_ovhg < 0: crick = crick[:crick_ovhg] elif crick_ovhg > 0: watson = watson[crick_ovhg:] # If necessary, we trim the right side if watson_ovhg < 0: watson = watson[:watson_ovhg] elif watson_ovhg > 0: crick = crick[watson_ovhg:] return Dseq(watson, crick=crick, ovhg=crick_ovhg)
# @property # def ovhg(self): # """The ovhg property. This cannot be set directly, but is a # consequence of how the watson and crick strands anneal to # each other""" # return self._ovhg # @property # def linear(self): # """The linear property can not be set directly. # Use an empty slice [:] to create a linear object.""" # return self._linear # @property # def circular(self): # """The circular property can not be set directly. # Use :meth:`looped` to create a circular Dseq object""" # return self._circular
[docs] def mw(self) -> float: """This method returns the molecular weight of the DNA molecule in g/mol. The following formula is used:: MW = (A x 313.2) + (T x 304.2) + (C x 289.2) + (G x 329.2) + (N x 308.9) + 79.0 """ nts = (self.watson + self.crick).lower() return ( 313.2 * nts.count("a") + 304.2 * nts.count("t") + 289.2 * nts.count("c") + 329.2 * nts.count("g") + 308.9 * nts.count("n") + 79.0 )
[docs] def upper(self: DseqType) -> DseqType: """Return an upper case copy of the sequence. >>> from pydna.dseq import Dseq >>> my_seq = Dseq("aAa") >>> my_seq Dseq(-3) aAa tTt >>> my_seq.upper() Dseq(-3) AAA TTT Returns ------- Dseq Dseq object in uppercase See also -------- pydna.dseq.Dseq.lower """ return self.quick( self.watson.upper(), self.crick.upper(), ovhg=self.ovhg, # linear=self.linear, circular=self.circular, pos=self.pos, )
[docs] def lower(self: DseqType) -> DseqType: """Return a lower case copy of the sequence. >>> from pydna.dseq import Dseq >>> my_seq = Dseq("aAa") >>> my_seq Dseq(-3) aAa tTt >>> my_seq.lower() Dseq(-3) aaa ttt Returns ------- Dseq Dseq object in lowercase See also -------- pydna.dseq.Dseq.upper """ return self.quick( self.watson.lower(), self.crick.lower(), ovhg=self.ovhg, # linear=self.linear, circular=self.circular, pos=self.pos, )
[docs] def find(self, sub: _Union[_SeqAbstractBaseClass, str, bytes], start=0, end=_sys.maxsize) -> int: """This method behaves like the python string method of the same name. Returns an integer, the index of the first occurrence of substring argument sub in the (sub)sequence given by [start:end]. Returns -1 if the subsequence is NOT found. Parameters ---------- sub : string or Seq object a string or another Seq object to look for. start : int, optional slice start. end : int, optional slice end. Examples -------- >>> from pydna.dseq import Dseq >>> seq = Dseq("atcgactgacgtgtt") >>> seq Dseq(-15) atcgactgacgtgtt tagctgactgcacaa >>> seq.find("gac") 3 >>> seq = Dseq(watson="agt",crick="actta",ovhg=-2) >>> seq Dseq(-7) agt attca >>> seq.find("taa") 2 """ if not self.circular: return _Seq.find(self, sub, start, end) return (_pretty_str(self) + _pretty_str(self)).find(sub, start, end)
def __getitem__(self, sl: slice) -> "Dseq": """Returns a subsequence. This method is used by the slice notation""" if not self.circular: x = len(self.crick) - self.ovhg - len(self.watson) sns = (self.ovhg * " " + self.watson + x * " ")[sl] asn = (-self.ovhg * " " + self.crick[::-1] + -x * " ")[sl] ovhg = max((len(sns) - len(sns.lstrip()), -len(asn) + len(asn.lstrip())), key=abs) return Dseq( sns.strip(), asn[::-1].strip(), ovhg=ovhg, # linear=True ) else: sl = slice(sl.start or 0, sl.stop or len(self), sl.step) if sl.start > len(self) or sl.stop > len(self): return Dseq("") if sl.start < sl.stop: return Dseq( self.watson[sl], self.crick[::-1][sl][::-1], ovhg=0, # linear=True ) else: try: stp = abs(sl.step) except TypeError: stp = 1 start = sl.start stop = sl.stop w = self.watson[(start or len(self)) :: stp] + self.watson[: (stop or 0) : stp] c = self.crick[len(self) - stop :: stp] + self.crick[: len(self) - start : stp] return Dseq(w, c, ovhg=0) # , linear=True) def __eq__(self, other: DseqType) -> bool: """Compare to another Dseq object OR an object that implements watson, crick and ovhg properties. This comparison is case insensitive. """ try: same = ( other.watson.lower() == self.watson.lower() and other.crick.lower() == self.crick.lower() and other.ovhg == self.ovhg and self.circular == other.circular ) # Also test for alphabet ? except AttributeError: same = False return same def __repr__(self): """Returns a representation of the sequence, truncated if longer than 30 bp""" if len(self) > Dseq.trunc: if self.ovhg > 0: d = self.crick[-self.ovhg :][::-1] hej = len(d) if len(d) > 10: d = "{}..{}".format(d[:4], d[-4:]) a = len(d) * " " elif self.ovhg < 0: a = self.watson[: max(0, -self.ovhg)] hej = len(a) if len(a) > 10: a = "{}..{}".format(a[:4], a[-4:]) d = len(a) * " " else: a = "" d = "" hej = 0 x = self.ovhg + len(self.watson) - len(self.crick) if x > 0: c = self.watson[len(self.crick) - self.ovhg :] y = len(c) if len(c) > 10: c = "{}..{}".format(c[:4], c[-4:]) f = len(c) * " " elif x < 0: f = self.crick[:-x][::-1] y = len(f) if len(f) > 10: f = "{}..{}".format(f[:4], f[-4:]) c = len(f) * " " else: c = "" f = "" y = 0 L = len(self) - hej - y x1 = -min(0, self.ovhg) x2 = x1 + L x3 = -min(0, x) x4 = x3 + L b = self.watson[x1:x2] e = self.crick[x3:x4][::-1] if len(b) > 10: b = "{}..{}".format(b[:4], b[-4:]) e = "{}..{}".format(e[:4], e[-4:]) return _pretty_str("{klass}({top}{size})\n" "{a}{b}{c}\n" "{d}{e}{f}").format( klass=self.__class__.__name__, top={False: "-", True: "o"}[self.circular], size=len(self), a=a, b=b, c=c, d=d, e=e, f=f, ) else: return _pretty_str( "{}({}{})\n{}\n{}".format( self.__class__.__name__, {False: "-", True: "o"}[self.circular], len(self), self.ovhg * " " + self.watson, -self.ovhg * " " + self.crick[::-1], ) )
[docs] def reverse_complement(self) -> "Dseq": """Dseq object where watson and crick have switched places. This represents the same double stranded sequence. Examples -------- >>> from pydna.dseq import Dseq >>> a=Dseq("catcgatc") >>> a Dseq(-8) catcgatc gtagctag >>> b=a.reverse_complement() >>> b Dseq(-8) gatcgatg ctagctac >>> """ return Dseq.quick( self.crick, self.watson, ovhg=len(self.watson) - len(self.crick) + self.ovhg, circular=self.circular, )
rc = reverse_complement # alias for reverse_complement
[docs] def shifted(self: DseqType, shift: int) -> DseqType: """Shifted version of a circular Dseq object.""" if not self.circular: raise TypeError("DNA is not circular.") shift = shift % len(self) if not shift: return _copy.deepcopy(self) else: return (self[shift:] + self[:shift]).looped()
[docs] def looped(self: DseqType) -> DseqType: """Circularized Dseq object. This can only be done if the two ends are compatible, otherwise a TypeError is raised. Examples -------- >>> from pydna.dseq import Dseq >>> a=Dseq("catcgatc") >>> a Dseq(-8) catcgatc gtagctag >>> a.looped() Dseq(o8) catcgatc gtagctag >>> a.T4("t") Dseq(-8) catcgat tagctag >>> a.T4("t").looped() Dseq(o7) catcgat gtagcta >>> a.T4("a") Dseq(-8) catcga agctag >>> a.T4("a").looped() Traceback (most recent call last): File "<stdin>", line 1, in <module> File "/usr/local/lib/python2.7/dist-packages/pydna/dsdna.py", line 357, in looped if type5 == type3 and str(sticky5) == str(rc(sticky3)): TypeError: DNA cannot be circularized. 5' and 3' sticky ends not compatible! >>> """ if self.circular: return _copy.deepcopy(self) type5, sticky5 = self.five_prime_end() type3, sticky3 = self.three_prime_end() if type5 == type3 and str(sticky5) == str(_rc(sticky3)): nseq = self.__class__.quick( self.watson, self.crick[-self.ovhg :] + self.crick[: -self.ovhg], ovhg=0, # linear=False, circular=True, ) # assert len(nseq.crick) == len(nseq.watson) return nseq else: raise TypeError("DNA cannot be circularized.\n" "5' and 3' sticky ends not compatible!")
[docs] def tolinear(self: DseqType) -> DseqType: # pragma: no cover """Returns a blunt, linear copy of a circular Dseq object. This can only be done if the Dseq object is circular, otherwise a TypeError is raised. This method is deprecated, use slicing instead. See example below. Examples -------- >>> from pydna.dseq import Dseq >>> a=Dseq("catcgatc", circular=True) >>> a Dseq(o8) catcgatc gtagctag >>> a[:] Dseq(-8) catcgatc gtagctag >>> """ import warnings as _warnings from pydna import _PydnaDeprecationWarning _warnings.warn( "tolinear method is obsolete; " "please use obj[:] " "instead of obj.tolinear().", _PydnaDeprecationWarning, ) if not self.circular: raise TypeError("DNA is not circular.\n") selfcopy = _copy.deepcopy(self) selfcopy.circular = False return selfcopy # self.__class__(self.watson, linear=True)
[docs] def five_prime_end(self) -> _Tuple[str, str]: """Returns a tuple describing the structure of the 5' end of the DNA fragment Examples -------- >>> from pydna.dseq import Dseq >>> a=Dseq("aaa", "ttt") >>> a Dseq(-3) aaa ttt >>> a.five_prime_end() ('blunt', '') >>> a=Dseq("aaa", "ttt", ovhg=1) >>> a Dseq(-4) aaa ttt >>> a.five_prime_end() ("3'", 't') >>> a=Dseq("aaa", "ttt", ovhg=-1) >>> a Dseq(-4) aaa ttt >>> a.five_prime_end() ("5'", 'a') >>> See also -------- pydna.dseq.Dseq.three_prime_end """ if self.watson and not self.crick: return "5'", self.watson.lower() if not self.watson and self.crick: return "3'", self.crick.lower() if self.ovhg < 0: sticky = self.watson[: -self.ovhg].lower() type_ = "5'" elif self.ovhg > 0: sticky = self.crick[-self.ovhg :].lower() type_ = "3'" else: sticky = "" type_ = "blunt" return type_, sticky
[docs] def three_prime_end(self) -> _Tuple[str, str]: """Returns a tuple describing the structure of the 5' end of the DNA fragment >>> from pydna.dseq import Dseq >>> a=Dseq("aaa", "ttt") >>> a Dseq(-3) aaa ttt >>> a.three_prime_end() ('blunt', '') >>> a=Dseq("aaa", "ttt", ovhg=1) >>> a Dseq(-4) aaa ttt >>> a.three_prime_end() ("3'", 'a') >>> a=Dseq("aaa", "ttt", ovhg=-1) >>> a Dseq(-4) aaa ttt >>> a.three_prime_end() ("5'", 't') >>> See also -------- pydna.dseq.Dseq.five_prime_end """ ovhg = len(self.watson) - len(self.crick) + self.ovhg if ovhg < 0: sticky = self.crick[:-ovhg].lower() type_ = "5'" elif ovhg > 0: sticky = self.watson[-ovhg:].lower() type_ = "3'" else: sticky = "" type_ = "blunt" return type_, sticky
[docs] def watson_ovhg(self) -> int: """Returns the overhang of the watson strand at the three prime.""" return len(self.watson) - len(self.crick) + self.ovhg
def __add__(self: DseqType, other: DseqType) -> DseqType: """Simulates ligation between two DNA fragments. Add other Dseq object at the end of the sequence. Type error is raised if any of the points below are fulfilled: * one or more objects are circular * if three prime sticky end of self is not the same type (5' or 3') as the sticky end of other * three prime sticky end of self complementary with five prime sticky end of other. Phosphorylation and dephosphorylation is not considered. DNA is allways presumed to have the necessary 5' phospate group necessary for ligation. """ # test for circular DNA if self.circular: raise TypeError("circular DNA cannot be ligated!") try: if other.circular: raise TypeError("circular DNA cannot be ligated!") except AttributeError: pass self_type, self_tail = self.three_prime_end() other_type, other_tail = other.five_prime_end() if self_type == other_type and str(self_tail) == str(_rc(other_tail)): answer = Dseq.quick(self.watson + other.watson, other.crick + self.crick, self.ovhg) elif not self: answer = _copy.deepcopy(other) elif not other: answer = _copy.deepcopy(self) else: raise TypeError("sticky ends not compatible!") return answer def __mul__(self: DseqType, number: int) -> DseqType: if not isinstance(number, int): raise TypeError("TypeError: can't multiply Dseq by non-int of type {}".format(type(number))) if number <= 0: return self.__class__("") new = _copy.deepcopy(self) for i in range(number - 1): new += self return new def _fill_in_five_prime(self: DseqType, nucleotides: str) -> str: stuffer = "" type, se = self.five_prime_end() if type == "5'": for n in _rc(se): if n in nucleotides: stuffer += n else: break return self.crick + stuffer, self.ovhg + len(stuffer) def _fill_in_three_prime(self: DseqType, nucleotides: str) -> str: stuffer = "" type, se = self.three_prime_end() if type == "5'": for n in _rc(se): if n in nucleotides: stuffer += n else: break return self.watson + stuffer
[docs] def fill_in(self, nucleotides: _Union[None, str] = None) -> "Dseq": """Fill in of five prime protruding end with a DNA polymerase that has only DNA polymerase activity (such as exo-klenow [#]_) and any combination of A, G, C or T. Default are all four nucleotides together. Parameters ---------- nucleotides : str Examples -------- >>> from pydna.dseq import Dseq >>> a=Dseq("aaa", "ttt") >>> a Dseq(-3) aaa ttt >>> a.fill_in() Dseq(-3) aaa ttt >>> b=Dseq("caaa", "cttt") >>> b Dseq(-5) caaa tttc >>> b.fill_in() Dseq(-5) caaag gtttc >>> b.fill_in("g") Dseq(-5) caaag gtttc >>> b.fill_in("tac") Dseq(-5) caaa tttc >>> c=Dseq("aaac", "tttg") >>> c Dseq(-5) aaac gttt >>> c.fill_in() Dseq(-5) aaac gttt >>> References ---------- .. [#] http://en.wikipedia.org/wiki/Klenow_fragment#The_exo-_Klenow_fragment """ if nucleotides is None: nucleotides = "GATCRYWSMKHBVDN" nucleotides = set(nucleotides.lower() + nucleotides.upper()) crick, ovhg = self._fill_in_five_prime(nucleotides) watson = self._fill_in_three_prime(nucleotides) return Dseq(watson, crick, ovhg)
[docs] def transcribe(self) -> _Seq: return _Seq(self.watson).transcribe()
[docs] def translate(self, table="Standard", stop_symbol="*", to_stop=False, cds=False, gap="-") -> _Seq: return _Seq(_translate_str(str(self), table, stop_symbol, to_stop, cds, gap=gap))
[docs] def mung(self) -> "Dseq": """ Simulates treatment a nuclease with 5'-3' and 3'-5' single strand specific exonuclease activity (such as mung bean nuclease [#]_) :: ggatcc -> gatcc ctaggg ctagg ggatcc -> ggatc tcctag cctag >>> from pydna.dseq import Dseq >>> b=Dseq("caaa", "cttt") >>> b Dseq(-5) caaa tttc >>> b.mung() Dseq(-3) aaa ttt >>> c=Dseq("aaac", "tttg") >>> c Dseq(-5) aaac gttt >>> c.mung() Dseq(-3) aaa ttt References ---------- .. [#] http://en.wikipedia.org/wiki/Mung_bean_nuclease """ return Dseq(self.watson[max(0, -self.ovhg) : min(len(self.watson), len(self.crick) - self.ovhg)])
[docs] def T4(self, nucleotides=None) -> "Dseq": """Fill in five prime protruding ends and chewing back three prime protruding ends by a DNA polymerase providing both 5'-3' DNA polymerase activity and 3'-5' nuclease acitivty (such as T4 DNA polymerase). This can be done in presence of any combination of the four A, G, C or T. Removing one or more nucleotides can facilitate engineering of sticky ends. Default are all four nucleotides together. Parameters ---------- nucleotides : str Examples -------- >>> from pydna.dseq import Dseq >>> a=Dseq("gatcgatc") >>> a Dseq(-8) gatcgatc ctagctag >>> a.T4() Dseq(-8) gatcgatc ctagctag >>> a.T4("t") Dseq(-8) gatcgat tagctag >>> a.T4("a") Dseq(-8) gatcga agctag >>> a.T4("g") Dseq(-8) gatcg gctag >>> """ if not nucleotides: nucleotides = "GATCRYWSMKHBVDN" nucleotides = set(nucleotides.lower() + nucleotides.upper()) type, se = self.five_prime_end() if type == "5'": crick, ovhg = self._fill_in_five_prime(nucleotides) else: if type == "3'": ovhg = 0 crick = self.crick[: -len(se)] else: ovhg = 0 crick = self.crick x = len(crick) - 1 while x >= 0: if crick[x] in nucleotides: break x -= 1 ovhg = x - len(crick) + 1 + ovhg crick = crick[: x + 1] if not crick: ovhg = 0 watson = self.watson type, se = self.three_prime_end() if type == "5'": watson = self._fill_in_three_prime(nucleotides) else: if type == "3'": watson = self.watson[: -len(se)] x = len(watson) - 1 while x >= 0: if watson[x] in nucleotides: break x -= 1 watson = watson[: x + 1] return Dseq(watson, crick, ovhg)
t4 = T4 # alias for the T4 method.
[docs] def exo1_front(self: DseqType, n=1) -> DseqType: """5'-3' resection at the start (left side) of the molecule.""" d = _copy.deepcopy(self) d.ovhg += n d.watson = d.watson[n:] return d
[docs] def exo1_end(self: DseqType, n=1) -> DseqType: """5'-3' resection at the end (right side) of the molecule.""" d = _copy.deepcopy(self) d.crick = d.crick[n:] return d
[docs] def no_cutters(self, batch: _Union[_RestrictionBatch, None] = None) -> _RestrictionBatch: """Enzymes in a RestrictionBatch not cutting sequence.""" if batch is None: batch = CommOnly ana = batch.search(self) ncut = {enz: sitelist for (enz, sitelist) in ana.items() if not sitelist} return _RestrictionBatch(ncut)
[docs] def unique_cutters(self, batch: _Union[_RestrictionBatch, None] = None) -> _RestrictionBatch: """Enzymes in a RestrictionBatch cutting sequence once.""" if batch is None: batch = CommOnly return self.n_cutters(n=1, batch=batch)
once_cutters = unique_cutters # alias for unique_cutters
[docs] def twice_cutters(self, batch: _Union[_RestrictionBatch, None] = None) -> _RestrictionBatch: """Enzymes in a RestrictionBatch cutting sequence twice.""" if batch is None: batch = CommOnly return self.n_cutters(n=2, batch=batch)
[docs] def n_cutters(self, n=3, batch: _Union[_RestrictionBatch, None] = None) -> _RestrictionBatch: """Enzymes in a RestrictionBatch cutting n times.""" if batch is None: batch = CommOnly ana = batch.search(self) ncut = {enz: sitelist for (enz, sitelist) in ana.items() if len(sitelist) == n} return _RestrictionBatch(ncut)
[docs] def cutters(self, batch: _Union[_RestrictionBatch, None] = None) -> _RestrictionBatch: """Enzymes in a RestrictionBatch cutting sequence at least once.""" if batch is None: batch = CommOnly ana = batch.search(self) ncut = {enz: sitelist for (enz, sitelist) in ana.items() if sitelist} return _RestrictionBatch(ncut)
[docs] def seguid(self) -> str: """SEGUID checksum for the sequence.""" if self.circular: cs = _cdseguid(self.watson.upper(), self.crick.upper(), alphabet="{DNA-extended}") else: """docstring.""" w = f"{self.ovhg * '-'}{self.watson}{'-' * (-self.ovhg + len(self.crick) - len(self.watson))}".upper() c = f"{'-' * (self.ovhg + len(self.watson) - len(self.crick))}{self.crick}{-self.ovhg * '-'}".upper() cs = _ldseguid(w, c, alphabet="{DNA-extended}") return cs
[docs] def isblunt(self) -> bool: """isblunt. Return True if Dseq is linear and blunt and false if staggered or circular. Examples -------- >>> from pydna.dseq import Dseq >>> a=Dseq("gat") >>> a Dseq(-3) gat cta >>> a.isblunt() True >>> a=Dseq("gat", "atcg") >>> a Dseq(-4) gat gcta >>> a.isblunt() False >>> a=Dseq("gat", "gatc") >>> a Dseq(-4) gat ctag >>> a.isblunt() False >>> a=Dseq("gat", circular=True) >>> a Dseq(o3) gat cta >>> a.isblunt() False """ return self.ovhg == 0 and len(self.watson) == len(self.crick) and not self.circular
[docs] def cas9(self, RNA: str) -> _Tuple[slice, ...]: """docstring.""" bRNA = bytes(RNA, "ASCII") slices = [] cuts = [0] for m in _re.finditer(bRNA, self._data): cuts.append(m.start() + 17) cuts.append(self.length) slices = tuple(slice(x, y, 1) for x, y in zip(cuts, cuts[1:])) return slices
[docs] def terminal_transferase(self, nucleotides="a") -> "Dseq": """docstring.""" ovhg = self.ovhg if self.ovhg >= 0: ovhg += len(nucleotides) return Dseq(self.watson + nucleotides, self.crick + nucleotides, ovhg)
[docs] def cut(self: DseqType, *enzymes: EnzymesType) -> _Tuple[DseqType, ...]: """Returns a list of linear Dseq fragments produced in the digestion. If there are no cuts, an empty list is returned. Parameters ---------- enzymes : enzyme object or iterable of such objects A Bio.Restriction.XXX restriction objects or iterable. Returns ------- frags : list list of Dseq objects formed by the digestion Examples -------- >>> from pydna.dseq import Dseq >>> seq=Dseq("ggatccnnngaattc") >>> seq Dseq(-15) ggatccnnngaattc cctaggnnncttaag >>> from Bio.Restriction import BamHI,EcoRI >>> type(seq.cut(BamHI)) <class 'tuple'> >>> for frag in seq.cut(BamHI): print(repr(frag)) Dseq(-5) g cctag Dseq(-14) gatccnnngaattc gnnncttaag >>> seq.cut(EcoRI, BamHI) == seq.cut(BamHI, EcoRI) True >>> a,b,c = seq.cut(EcoRI, BamHI) >>> a+b+c Dseq(-15) ggatccnnngaattc cctaggnnncttaag >>> """ cutsites = self.get_cutsites(*enzymes) cutsite_pairs = self.get_cutsite_pairs(cutsites) return tuple(self.apply_cut(*cs) for cs in cutsite_pairs)
[docs] def cutsite_is_valid(self, cutsite: CutSiteType) -> bool: """Returns False if: - Cut positions fall outside the sequence (could be moved to Biopython) - Overhang is not double stranded - Recognition site is not double stranded or is outside the sequence - For enzymes that cut twice, it checks that at least one possibility is valid """ assert cutsite is not None, "cutsite is None" enz = cutsite[1] watson, crick, ovhg = self.get_cut_parameters(cutsite, True) # The cut positions fall within the sequence # This could go into Biopython if not self.circular and crick < 0 or crick > len(self): return False # The overhang is double stranded overhang_dseq = self[watson:crick] if ovhg < 0 else self[crick:watson] if overhang_dseq.ovhg != 0 or overhang_dseq.watson_ovhg() != 0: return False # The recognition site is double stranded and within the sequence start_of_recognition_site = watson - enz.fst5 if start_of_recognition_site < 0: start_of_recognition_site += len(self) end_of_recognition_site = start_of_recognition_site + enz.size if self.circular: end_of_recognition_site %= len(self) recognition_site = self[start_of_recognition_site:end_of_recognition_site] if len(recognition_site) == 0 or recognition_site.ovhg != 0 or recognition_site.watson_ovhg() != 0: if enz is None or enz.scd5 is None: return False else: # For enzymes that cut twice, this might be referring to the second one start_of_recognition_site = watson - enz.scd5 if start_of_recognition_site < 0: start_of_recognition_site += len(self) end_of_recognition_site = start_of_recognition_site + enz.size if self.circular: end_of_recognition_site %= len(self) recognition_site = self[start_of_recognition_site:end_of_recognition_site] if len(recognition_site) == 0 or recognition_site.ovhg != 0 or recognition_site.watson_ovhg() != 0: return False return True
[docs] def get_cutsites(self: DseqType, *enzymes: EnzymesType) -> _List[CutSiteType]: """Returns a list of cutsites, represented represented as `((cut_watson, ovhg), enz)`: - `cut_watson` is a positive integer contained in `[0,len(seq))`, where `seq` is the sequence that will be cut. It represents the position of the cut on the watson strand, using the full sequence as a reference. By "full sequence" I mean the one you would get from `str(Dseq)`. - `ovhg` is the overhang left after the cut. It has the same meaning as `ovhg` in the `Bio.Restriction` enzyme objects, or pydna's `Dseq` property. - `enz` is the enzyme object. It's not necessary to perform the cut, but can be used to keep track of which enzyme was used. Cuts are only returned if the recognition site and overhang are on the double-strand part of the sequence. Parameters ---------- enzymes : Union[_RestrictionBatch,list[_AbstractCut]] Returns ------- list[tuple[tuple[int,int], _AbstractCut]] Examples -------- >>> from Bio.Restriction import EcoRI >>> from pydna.dseq import Dseq >>> seq = Dseq('AAGAATTCAAGAATTC') >>> seq.get_cutsites(EcoRI) [((3, -4), EcoRI), ((11, -4), EcoRI)] `cut_watson` is defined with respect to the "full sequence", not the watson strand: >>> dseq = Dseq.from_full_sequence_and_overhangs('aaGAATTCaa', 1, 0) >>> dseq Dseq(-10) aGAATTCaa ttCTTAAGtt >>> dseq.get_cutsites([EcoRI]) [((3, -4), EcoRI)] Cuts are only returned if the recognition site and overhang are on the double-strand part of the sequence. >>> Dseq('GAATTC').get_cutsites([EcoRI]) [((1, -4), EcoRI)] >>> Dseq.from_full_sequence_and_overhangs('GAATTC', -1, 0).get_cutsites([EcoRI]) [] """ if len(enzymes) == 1 and isinstance(enzymes[0], _RestrictionBatch): # argument is probably a RestrictionBatch enzymes = [e for e in enzymes[0]] enzymes = _flatten(enzymes) out = list() for e in enzymes: # Positions of the cut on the watson strand. They are 1-based, so we subtract # 1 to get 0-based positions cuts_watson = [c - 1 for c in e.search(self, linear=(not self.circular))] out += [((w, e.ovhg), e) for w in cuts_watson] return sorted([cutsite for cutsite in out if self.cutsite_is_valid(cutsite)])
[docs] def left_end_position(self) -> _Tuple[int, int]: """ The index in the full sequence of the watson and crick start positions. full sequence (str(self)) for all three cases is AAA :: AAA AA AAT TT TTT TTT Returns (0, 1) Returns (1, 0) Returns (0, 0) """ if self.ovhg > 0: return self.ovhg, 0 return 0, -self.ovhg
[docs] def right_end_position(self) -> _Tuple[int, int]: """The index in the full sequence of the watson and crick end positions. full sequence (str(self)) for all three cases is AAA ``` AAA AA AAA TT TTT TTT Returns (3, 2) Returns (2, 3) Returns (3, 3) ``` """ if self.watson_ovhg() < 0: return len(self) + self.watson_ovhg(), len(self) return len(self), len(self) - self.watson_ovhg()
[docs] def get_cut_parameters(self, cut: _Union[CutSiteType, None], is_left: bool) -> _Tuple[int, int, int]: """For a given cut expressed as ((cut_watson, ovhg), enz), returns a tuple (cut_watson, cut_crick, ovhg). - cut_watson: see get_cutsites docs - cut_crick: equivalent of cut_watson in the crick strand - ovhg: see get_cutsites docs The cut can be None if it represents the left or right end of the sequence. Then it will return the position of the watson and crick ends with respect to the "full sequence". The `is_left` parameter is only used in this case. """ if cut is not None: watson, ovhg = cut[0] crick = watson - ovhg if self.circular: crick %= len(self) return watson, crick, ovhg assert not self.circular, "Circular sequences should not have None cuts" if is_left: return *self.left_end_position(), self.ovhg # In the right end, the overhang does not matter return *self.right_end_position(), self.watson_ovhg()
[docs] def apply_cut(self, left_cut: CutSiteType, right_cut: CutSiteType) -> "Dseq": """Extracts a subfragment of the sequence between two cuts. For more detail see the documentation of get_cutsite_pairs. Parameters ---------- left_cut : Union[tuple[tuple[int,int], _AbstractCut], None] right_cut: Union[tuple[tuple[int,int], _AbstractCut], None] Returns ------- Dseq Examples -------- >>> from Bio.Restriction import EcoRI >>> from pydna.dseq import Dseq >>> dseq = Dseq('aaGAATTCaaGAATTCaa') >>> cutsites = dseq.get_cutsites([EcoRI]) >>> cutsites [((3, -4), EcoRI), ((11, -4), EcoRI)] >>> p1, p2, p3 = dseq.get_cutsite_pairs(cutsites) >>> p1 (None, ((3, -4), EcoRI)) >>> dseq.apply_cut(*p1) Dseq(-7) aaG ttCTTAA >>> p2 (((3, -4), EcoRI), ((11, -4), EcoRI)) >>> dseq.apply_cut(*p2) Dseq(-12) AATTCaaG GttCTTAA >>> p3 (((11, -4), EcoRI), None) >>> dseq.apply_cut(*p3) Dseq(-7) AATTCaa Gtt >>> dseq = Dseq('TTCaaGAA', circular=True) >>> cutsites = dseq.get_cutsites([EcoRI]) >>> cutsites [((6, -4), EcoRI)] >>> pair = dseq.get_cutsite_pairs(cutsites)[0] >>> pair (((6, -4), EcoRI), ((6, -4), EcoRI)) >>> dseq.apply_cut(*pair) Dseq(-12) AATTCaaG GttCTTAA """ if _cuts_overlap(left_cut, right_cut, len(self)): raise ValueError("Cuts by {} {} overlap.".format(left_cut[1], right_cut[1])) left_watson, left_crick, ovhg_left = self.get_cut_parameters(left_cut, True) right_watson, right_crick, _ = self.get_cut_parameters(right_cut, False) return Dseq( str(self[left_watson:right_watson]), # The line below could be easier to understand as _rc(str(self[left_crick:right_crick])), but it does not preserve the case str(self.reverse_complement()[len(self) - right_crick : len(self) - left_crick]), ovhg=ovhg_left, )
[docs] def get_cutsite_pairs( self, cutsites: _List[CutSiteType] ) -> _List[_Tuple[_Union[None, CutSiteType], _Union[None, CutSiteType]]]: """Returns pairs of cutsites that render the edges of the resulting fragments. A fragment produced by restriction is represented by a tuple of length 2 that may contain cutsites or `None`: - Two cutsites: represents the extraction of a fragment between those two cutsites, in that orientation. To represent the opening of a circular molecule with a single cutsite, we put the same cutsite twice. - `None`, cutsite: represents the extraction of a fragment between the left edge of linear sequence and the cutsite. - cutsite, `None`: represents the extraction of a fragment between the cutsite and the right edge of a linear sequence. Parameters ---------- cutsites : list[tuple[tuple[int,int], _AbstractCut]] Returns ------- list[tuple[tuple[tuple[int,int], _AbstractCut]|None],tuple[tuple[int,int], _AbstractCut]|None] Examples -------- >>> from Bio.Restriction import EcoRI >>> from pydna.dseq import Dseq >>> dseq = Dseq('aaGAATTCaaGAATTCaa') >>> cutsites = dseq.get_cutsites([EcoRI]) >>> cutsites [((3, -4), EcoRI), ((11, -4), EcoRI)] >>> dseq.get_cutsite_pairs(cutsites) [(None, ((3, -4), EcoRI)), (((3, -4), EcoRI), ((11, -4), EcoRI)), (((11, -4), EcoRI), None)] >>> dseq = Dseq('TTCaaGAA', circular=True) >>> cutsites = dseq.get_cutsites([EcoRI]) >>> cutsites [((6, -4), EcoRI)] >>> dseq.get_cutsite_pairs(cutsites) [(((6, -4), EcoRI), ((6, -4), EcoRI))] """ if len(cutsites) == 0: return [] if not self.circular: cutsites = [None, *cutsites, None] else: # Add the first cutsite at the end, for circular cuts cutsites.append(cutsites[0]) return list(zip(cutsites, cutsites[1:]))
if __name__ == "__main__": import os as _os cached = _os.getenv("pydna_cached_funcs", "") _os.environ["pydna_cached_funcs"] = "" import doctest doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS) _os.environ["pydna_cached_funcs"] = cached