#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2013-2023 by Björn Johansson. All rights reserved.
# This code is part of the Python-dna distribution and governed by its
# license. Please see the LICENSE.txt file that should have been included
# as part of this package.
"""This module provides the :class:`Dseqrecord` class, for handling double stranded
DNA sequences. The Dseqrecord holds sequence information in the form of a :class:`pydna.dseq.Dseq`
object. The Dseq and Dseqrecord classes are subclasses of Biopythons
Seq and SeqRecord classes, respectively.
The Dseq and Dseqrecord classes support the notion of circular and linear DNA topology.
"""
from Bio.Restriction import RestrictionBatch as _RestrictionBatch
from Bio.Restriction import CommOnly
from pydna.dseq import Dseq as _Dseq
from pydna._pretty import pretty_str as _pretty_str
from pydna.utils import flatten as _flatten, location_boundaries as _location_boundaries
# from pydna.utils import memorize as _memorize
from pydna.utils import rc as _rc
from pydna.utils import shift_location as _shift_location
from pydna.utils import shift_feature as _shift_feature
from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
from Bio.SeqFeature import SeqFeature as _SeqFeature
from Bio import SeqIO
from Bio.SeqFeature import CompoundLocation as _CompoundLocation
from Bio.SeqFeature import SimpleLocation as _SimpleLocation
from pydna.seqrecord import SeqRecord as _SeqRecord
from Bio.Seq import translate as _translate
from pydna.utils import identifier_from_string as _identifier_from_string
import copy as _copy
import operator as _operator
import os as _os
import re as _re
import time as _time
import datetime as _datetime
import logging as _logging
_module_logger = _logging.getLogger("pydna." + __name__)
try:
from IPython.display import display_html as _display_html
except ImportError:
def _display_html(item, raw=None):
return item
[docs]class Dseqrecord(_SeqRecord):
"""Dseqrecord is a double stranded version of the Biopython SeqRecord [#]_ class.
The Dseqrecord object holds a Dseq object describing the sequence.
Additionally, Dseqrecord hold meta information about the sequence in the
from of a list of SeqFeatures, in the same way as the SeqRecord does.
The Dseqrecord can be initialized with a string, Seq, Dseq, SeqRecord
or another Dseqrecord. The sequence information will be stored in a
Dseq object in all cases.
Dseqrecord objects can be read or parsed from sequences in FASTA, EMBL or Genbank formats.
See the :mod:`pydna.readers` and :mod:`pydna.parsers` modules for further information.
There is a short representation associated with the Dseqrecord.
``Dseqrecord(-3)`` represents a linear sequence of length 2
while ``Dseqrecord(o7)``
represents a circular sequence of length 7.
Dseqrecord and Dseq share the same concept of length. This length can be larger
than each strand alone if they are staggered as in the example below.
::
<-- length -->
GATCCTTT
AAAGCCTAG
Parameters
----------
record : string, Seq, SeqRecord, Dseq or other Dseqrecord object
This data will be used to form the seq property
circular : bool, optional
True or False reflecting the shape of the DNA molecule
linear : bool, optional
True or False reflecting the shape of the DNA molecule
Examples
--------
>>> from pydna.dseqrecord import Dseqrecord
>>> a=Dseqrecord("aaa")
>>> a
Dseqrecord(-3)
>>> a.seq
Dseq(-3)
aaa
ttt
>>> from pydna.seq import Seq
>>> b=Dseqrecord(Seq("aaa"))
>>> b
Dseqrecord(-3)
>>> b.seq
Dseq(-3)
aaa
ttt
>>> from Bio.SeqRecord import SeqRecord
>>> c=Dseqrecord(SeqRecord(Seq("aaa")))
>>> c
Dseqrecord(-3)
>>> c.seq
Dseq(-3)
aaa
ttt
References
----------
.. [#] http://biopython.org/wiki/SeqRecord
"""
def __init__(
self,
record,
*args,
circular=None,
n=5e-14, # mol ( = 0.05 pmol)
**kwargs,
):
_module_logger.info("### Dseqrecord initialized ###")
_module_logger.info("argument circular = %s", circular)
_module_logger.info("circular = %s", circular)
if isinstance(record, str):
_module_logger.info("record is a string")
super().__init__(
_Dseq.from_string(
record,
# linear=linear,
circular=bool(circular),
),
*args,
**kwargs,
)
# record is a Dseq object ?
elif hasattr(record, "watson"):
if circular is False:
record = record[:]
elif circular is True:
record = record.looped()
_module_logger.info("record is a Dseq object")
super().__init__(record, *args, **kwargs)
# record is a Bio.Seq object ?
elif hasattr(record, "transcribe"):
_module_logger.info("record is a Seq object")
super().__init__(
_Dseq(
str(record),
# linear=linear,
circular=bool(circular),
),
*args,
**kwargs,
)
# record is a Bio.SeqRecord or Dseqrecord object ?
elif hasattr(record, "features"):
_module_logger.info("record is a Bio.SeqRecord or Dseqrecord object")
for key, value in list(record.__dict__.items()):
setattr(self, key, value)
self.letter_annotations = {}
# record.seq is a Dseq object ?
if hasattr(record.seq, "watson"):
new_seq = _copy.copy(record.seq)
if circular is False:
new_seq = new_seq[:]
elif circular is True:
new_seq = new_seq.looped()
self.seq = new_seq
# record.seq is Bio.SeqRecord object ?
else:
self.seq = _Dseq(
str(record.seq),
# linear=linear,
circular=bool(circular),
)
else:
raise ValueError("don't know what to do with {}".format(record))
self.map_target = None
self.n = n # amount, set to 5E-14 which is 5 pmols
self.annotations.update({"molecule_type": "DNA"})
[docs] @classmethod
def from_string(
cls,
record: str = "",
*args,
# linear=True,
circular=False,
n=5e-14,
**kwargs,
):
"""docstring."""
# def from_string(cls, record:str="", *args,
# linear=True, circular=False, n = 5E-14, **kwargs):
obj = cls.__new__(cls) # Does not call __init__
obj._per_letter_annotations = {}
obj.seq = _Dseq.quick(
record,
_rc(record),
ovhg=0,
# linear=linear,
circular=circular,
)
obj.id = _pretty_str("id")
obj.name = _pretty_str("name")
obj.description = _pretty_str("description")
obj.dbxrefs = []
obj.annotations = {"molecule_type": "DNA"}
obj.features = []
obj.map_target = None
obj.n = n
obj.__dict__.update(kwargs)
return obj
[docs] @classmethod
def from_SeqRecord(
cls,
record: _SeqRecord,
*args,
circular=None,
n=5e-14,
**kwargs,
):
obj = cls.__new__(cls) # Does not call __init__
obj._per_letter_annotations = record._per_letter_annotations
obj.id = record.id
obj.name = record.name
obj.description = record.description
obj.dbxrefs = record.dbxrefs
obj.annotations = {"molecule_type": "DNA"}
obj.annotations.update(record.annotations)
obj.features = record.features
obj.map_target = None
obj.n = n
if circular is None:
circular = record.annotations.get("topology") == "circular"
obj.seq = _Dseq.quick(str(record.seq), _rc(str(record.seq)), ovhg=0, circular=circular)
return obj
@property
def circular(self):
"""The circular property can not be set directly.
Use :meth:`looped`"""
return self.seq.circular
[docs] def m(self):
"""This method returns the mass of the DNA molecule in grams. This is
calculated as the product between the molecular weight of the Dseq object
and the"""
return self.seq.mw() * self.n # Da(g/mol) * mol = g
[docs] def add_feature(self, x=None, y=None, seq=None, type_="misc", strand=1, *args, **kwargs):
"""Add a feature of type misc to the feature list of the sequence.
Parameters
----------
x : int
Indicates start of the feature
y : int
Indicates end of the feature
Examples
--------
>>> from pydna.seqrecord import SeqRecord
>>> a=SeqRecord("atgtaa")
>>> a.features
[]
>>> a.add_feature(2,4)
>>> a.features
[SeqFeature(SimpleLocation(ExactPosition(2), ExactPosition(4), strand=1), type='misc', qualifiers=...)]
"""
if x and y and self.circular and x > y:
pass
else:
super().add_feature(x, y, seq, type_, strand=strand, *args, **kwargs)
return
qualifiers = {}
qualifiers.update(kwargs)
location = _CompoundLocation(
(
_SimpleLocation(x, self.seq.length, strand=strand),
_SimpleLocation(0, y, strand=strand),
)
)
sf = _SeqFeature(location, type=type_, qualifiers=qualifiers)
if "label" not in qualifiers:
qualifiers["label"] = [f"ft{len(location)}"]
if sf.extract(self).isorf():
qualifiers["label"] = [f"orf{len(location)}"]
self.features.append(sf)
[docs] def seguid(self):
"""Url safe SEGUID for the sequence.
This checksum is the same as seguid but with base64.urlsafe
encoding instead of the normal base64. This means that
the characters + and / are replaced with - and _ so that
the checksum can be part of a URL.
Examples
--------
>>> from pydna.dseqrecord import Dseqrecord
>>> a = Dseqrecord("aa")
>>> a.seguid()
'ldseguid=TEwydy0ugvGXh3VJnVwgtxoyDQA'
"""
return self.seq.seguid()
[docs] def looped(self):
"""
Circular version of the Dseqrecord object.
The underlying linear Dseq object has to have compatible ends.
Examples
--------
>>> from pydna.dseqrecord import Dseqrecord
>>> a=Dseqrecord("aaa")
>>> a
Dseqrecord(-3)
>>> b=a.looped()
>>> b
Dseqrecord(o3)
>>>
See Also
--------
pydna.dseq.Dseq.looped
"""
new = _copy.copy(self)
# for key, value in list(self.__dict__.items()):
# setattr(new, key, value)
new._seq = self.seq.looped()
five_prime = self.seq.five_prime_end()
for fn, fo in zip(new.features, self.features):
if five_prime[0] == "5'":
pass
# fn.location = fn.location + self.seq.ovhg
elif five_prime[0] == "3'":
fn.location = fn.location + (-self.seq.ovhg)
if fn.location.start < 0:
loc1 = _SimpleLocation(len(new) + fn.location.start, len(new), strand=fn.location.strand)
loc2 = _SimpleLocation(0, fn.location.end, strand=fn.location.strand)
fn.location = _CompoundLocation([loc1, loc2])
if fn.location.end > len(new):
loc1 = _SimpleLocation(fn.location.start, len(new), strand=fn.location.strand)
loc2 = _SimpleLocation(0, fn.location.end - len(new), strand=fn.location.strand)
fn.location = _CompoundLocation([loc1, loc2])
fn.qualifiers = fo.qualifiers
return new
[docs] def tolinear(self): # pragma: no cover
"""
Returns a linear, blunt copy of a circular Dseqrecord object. The
underlying Dseq object has to be circular.
This method is deprecated, use slicing instead. See example below.
Examples
--------
>>> from pydna.dseqrecord import Dseqrecord
>>> a=Dseqrecord("aaa", circular = True)
>>> a
Dseqrecord(o3)
>>> b=a[:]
>>> b
Dseqrecord(-3)
>>>
"""
import warnings as _warnings
from pydna import _PydnaDeprecationWarning
_warnings.warn(
"tolinear method is obsolete; " "please use obj[:] " "instead of obj.tolinear().",
_PydnaDeprecationWarning,
)
new = _copy.copy(self)
for key, value in list(self.__dict__.items()):
setattr(new, key, value)
# new._seq = self.seq.tolinear()
for fn, fo in zip(new.features, self.features):
fn.qualifiers = fo.qualifiers
return new
[docs] def terminal_transferase(self, nucleotides="a"):
"""docstring."""
newseq = _copy.deepcopy(self)
newseq.seq = self.seq.terminal_transferase(nucleotides)
for feature in newseq.features:
feature.location += len(nucleotides)
return newseq
[docs] def write(self, filename=None, f="gb"):
"""Writes the Dseqrecord to a file using the format f, which must
be a format supported by Biopython SeqIO for writing [#]_. Default
is "gb" which is short for Genbank. Note that Biopython SeqIO reads
more formats than it writes.
Filename is the path to the file where the sequece is to be
written. The filename is optional, if it is not given, the
description property (string) is used together with the format.
If obj is the Dseqrecord object, the default file name will be:
``<obj.locus>.<f>``
Where <f> is "gb" by default. If the filename already exists and
AND the sequence it contains is different, a new file name will be
used so that the old file is not lost:
``<obj.locus>_NEW.<f>``
References
----------
.. [#] http://biopython.org/wiki/SeqIO
"""
msg = ""
if not filename:
filename = "{name}.{type}".format(name=self.locus, type=f)
# generate a name if no name was given
# if not isinstance(filename, str): # is filename a string???
# raise ValueError("filename has to be a string, got", type(filename))
name, ext = _os.path.splitext(filename)
msg = f"<font face=monospace><a href='{filename}' target='_blank'>{filename}</a></font><br>"
if not _os.path.isfile(filename):
with open(filename, "w", encoding="utf8") as fp:
fp.write(self.format(f))
else:
from pydna.readers import read
old_file = read(filename)
if self.seq != old_file.seq:
# If new sequence is different, the old file is
# renamed with "_OLD_" suffix:
oldmtime = _datetime.datetime.fromtimestamp(_os.path.getmtime(filename)).isoformat()
tstmp = int(_time.time() * 1_000_000)
old_filename = f"{name}_OLD_{tstmp}{ext}"
_os.rename(filename, old_filename)
with open(filename, "w", encoding="utf8") as fp:
fp.write(self.format(f))
newmtime = _datetime.datetime.fromtimestamp(_os.path.getmtime(filename)).isoformat()
msg = f"""
<table style="padding:10px 10px;
word-break:normal;
border-color:#fe0000;
border-collapse:collapse;
border-spacing:1;
font-family:monospace;
font-size:large;
font-weight:bold;
text-align:left;
border: 5px solid red;">
<thead>
<tr style="color:#0000FF;border: 1px solid;text-align:left;">
<th style="color:#fe0000;border: 1px solid;text-align:center;font-size:xxx-large;text-align:left;">⚠</th>
<th style="color:#f56b00;border: 1px solid;text-align:left;" colspan="2">Sequence change</th>
</tr>
</thead>
<tbody>
<tr style="color:#0000FF;border: 1px solid;text-align:left;">
<td>Filename</td>
<td style="color:#fe0000;border: 1px solid;text-align:left;"><a href='{filename}' target='_blank'>{filename}</a></td>
<td style="color:#32cb00;border: 1px solid;text-align:left;"><a href='{old_filename}' target='_blank'>{old_filename}</a></td>
</tr>
<tr style="color:#0000FF;border: 1px solid;text-align:left;">
<td >Saved</td>
<td style="color:#fe0000;border: 1px solid;text-align:left;">{newmtime}</td>
<td style="color:#32cb00;border: 1px solid;text-align:left;">{oldmtime}</td>
</tr>
<tr style="color:#0000FF;border: 1px solid;text-align:left;">
<td>Length</td>
<td style="color:#fe0000;border: 1px solid;text-align:left;">{len(self)}</td>
<td style="color:#32cb00;border: 1px solid;text-align:left;">{len(old_file)}</td>
</tr>
<tr style="color:#0000FF;border: 1px solid;text-align:left;">
<td>SEGUID</td>
<td style="color:#fe0000;border: 1px solid;text-align:left;">{self.seguid()}</td>
<td style="color:#32cb00;border: 1px solid;text-align:left;">{old_file.seguid()}</td>
</tr>
</tbody>
</table>
"""
elif "seguid" in old_file.annotations.get("comment", ""):
pattern = r"(ldseguid|cdseguid)-(\S{27})(_[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{6}){0,1}"
# seguid=NNNNNNNNNNNNNNNNNNNNNNNNNNN_2020-10-10T11:11:11.111111
oldstamp = _re.search(pattern, old_file.description)
newstamp = _re.search(pattern, self.description)
newdescription = self.description
if oldstamp and newstamp:
if oldstamp.group(0)[:35] == newstamp.group(0)[:35]:
newdescription = newdescription.replace(newstamp.group(0), oldstamp.group(0))
elif oldstamp:
newdescription += " " + oldstamp.group(0)
newobj = _copy.copy(self)
newobj.description = newdescription
with open(filename, "w", encoding="utf8") as fp:
fp.write(newobj.format(f))
else:
with open(filename, "w", encoding="utf8") as fp:
fp.write(self.format(f))
return _display_html(msg, raw=True)
[docs] def find(self, other):
# TODO allow strings, seqs, seqrecords or Dseqrecords
# TODO check for linearity of other, raise exception if not
# TODO add tests and docstring for this method
o = str(other.seq).upper()
if not self.circular:
s = str(self.seq).upper()
else:
# allow wrapping around origin
s = str(self.seq).upper() + str(self.seq).upper()[: len(other) - 1]
return s.find(o)
def __str__(self):
return ("Dseqrecord\n" "circular: {}\n" "size: {}\n").format(self.circular, len(self)) + _SeqRecord.__str__(
self
)
def __contains__(self, other):
if other.lower() in str(self.seq).lower():
return True
else:
s = self.seq.watson.replace(" ", "")
ln = len(s)
spc = 3 - ln % 3 if ln % 3 else 0
s = "n" * spc + s + "nnn"
for frame in range(3):
if other.lower() in _translate(s[frame : frame + spc + ln]).lower():
return True
return False
[docs] def find_aminoacids(self, other):
"""
>>> from pydna.dseqrecord import Dseqrecord
>>> s=Dseqrecord("atgtacgatcgtatgctggttatattttag")
>>> s.seq.translate()
Seq('MYDRMLVIF*')
>>> "RML" in s
True
>>> "MMM" in s
False
>>> s.seq.rc().translate()
Seq('LKYNQHTIVH')
>>> "QHT" in s.rc()
True
>>> "QHT" in s
False
>>> slc = s.find_aa("RML")
>>> slc
slice(9, 18, None)
>>> s[slc]
Dseqrecord(-9)
>>> code = s[slc].seq
>>> code
Dseq(-9)
cgtatgctg
gcatacgac
>>> code.translate()
Seq('RML')
"""
other = str(other).lower()
assert self.seq.watson == "".join(self.seq.watson.split())
s = self.seq.watson
ln = len(s)
spc = 3 - ln % 3 if ln % 3 else 0
s = s + "n" * spc + "nnn"
start = None
for frame in range(3):
try:
start = _translate(s[frame : frame + ln + spc]).lower().index(other)
break
except ValueError:
pass
oh = self.seq.ovhg if self.seq.ovhg > 0 else 0
if start is None:
return None # TODO return an emoty slice or False...?
else:
return slice(frame + start * 3 + oh, frame + (start + len(other)) * 3 + oh)
find_aa = find_aminoacids
[docs] def map_trace_files(self, pth, limit=25): # TODO allow path-like objects
import glob
traces = []
for name in glob.glob(pth):
trace = SeqIO.read(name, "abi").lower()
trace.annotations["filename"] = trace.fname = name
traces.append(trace)
if not traces:
raise ValueError("No trace files found in {}".format(pth))
if hasattr(self.map_target, "step"):
area = self.map_target
elif hasattr(self.map_target, "extract"):
area = slice(self.map_target.location.start, self.map_target.location.end)
else:
area = None # TODO allow other objects as well and do some checks on map target
if area:
self.matching_reads = []
self.not_matching_reads = []
target = str(self[area].seq).lower()
target_rc = str(self[area].seq.rc()).lower()
for trace in traces:
if target in str(trace.seq) or target_rc in str(trace.seq):
self.matching_reads.append(trace)
else:
self.not_matching_reads.append(trace)
reads = self.matching_reads
else:
self.matching_reads = None
self.not_matching_reads = None
reads = traces
matching_reads = []
for read_ in reads:
matches = _common_sub_strings(str(self.seq).lower(), str(read_.seq), limit)
if not matches:
continue
if len(matches) > 1:
newmatches = [
matches[0],
]
for i, x in enumerate(matches[1:]):
g, f, h = matches[i]
if g + h < x[0] and f + h < x[1]:
newmatches.append(x)
else: # len(matches)==1
newmatches = matches
matching_reads.append(read_)
if len(newmatches) > 1:
ms = []
for m in newmatches:
ms.append(_SimpleLocation(m[0], m[0] + m[2]))
loc = _CompoundLocation(ms)
else:
a, b, c = newmatches[0]
loc = _SimpleLocation(a, a + c)
self.features.append(
_SeqFeature(
loc,
qualifiers={"label": [read_.annotations["filename"]]},
type="trace",
)
)
return [x.annotations["filename"] for x in matching_reads]
def __repr__(self):
return "Dseqrecord({}{})".format({True: "-", False: "o"}[not self.circular], len(self))
def _repr_pretty_(self, p, cycle):
p.text("Dseqrecord({}{})".format({True: "-", False: "o"}[not self.circular], len(self)))
def __add__(self, other):
if hasattr(other, "seq") and hasattr(other.seq, "watson"):
other = _copy.deepcopy(other)
other_five_prime = other.seq.five_prime_end()
if other_five_prime[0] == "5'":
# add other.seq.ovhg
for f in other.features:
f.location = f.location + other.seq.ovhg
elif other_five_prime[0] == "3'":
# subtract other.seq.ovhg (sign change)
for f in other.features:
f.location = f.location + (-other.seq.ovhg)
answer = Dseqrecord(_SeqRecord.__add__(self, other))
answer.n = min(self.n, other.n)
else:
answer = Dseqrecord(_SeqRecord.__add__(self, Dseqrecord(other)))
answer.n = self.n
return answer
def __mul__(self, number):
if not isinstance(number, int):
raise TypeError("TypeError: can't multiply Dseqrecord by non-int of type {}".format(type(number)))
if self.circular:
raise TypeError("TypeError: can't multiply circular Dseqrecord.")
if number > 0:
new = _copy.copy(self)
for i in range(1, number):
new += self
return new
else:
return self.__class__("")
def __getitem__(self, sl):
"""docstring."""
answer = Dseqrecord(_copy.copy(self))
answer.seq = self.seq.__getitem__(sl)
# answer.seq.alphabet = self.seq.alphabet
# breakpoint()
sl_start = sl.start or 0 # 6
sl_stop = sl.stop or len(self.seq) # 1
if not self.circular or sl_start < sl_stop:
# TODO: special case for sl_end == 0 in circular sequences
# related to https://github.com/BjornFJohansson/pydna/issues/161
if self.circular and sl.stop == 0:
sl = slice(sl.start, len(self.seq), sl.step)
answer.features = super().__getitem__(sl).features
elif self.circular and sl_start > sl_stop:
answer.features = self.shifted(sl_start).features
# origin-spanning features should only be included after shifting
# in cases where the slice comprises the entire sequence, but then
# sl_start == sl_stop and the second condition is not met
answer.features = [
f
for f in answer.features
if (
_location_boundaries(f.location)[1] <= answer.seq.length
and _location_boundaries(f.location)[0] < _location_boundaries(f.location)[1]
)
]
elif self.circular and sl_start == sl_stop:
cut = ((sl_start, 0), None)
return self.apply_cut(cut, cut)
else:
answer = Dseqrecord("")
identifier = "part_{id}".format(id=self.id)
if answer.features:
sf = max(answer.features, key=len) # default
if "label" in sf.qualifiers:
identifier = " ".join(sf.qualifiers["label"])
elif "note" in sf.qualifiers:
identifier = " ".join(sf.qualifiers["note"])
answer.id = _identifier_from_string(identifier)[:16]
answer.name = _identifier_from_string("part_{name}".format(name=self.name))[:16]
return answer
def __eq__(self, other):
"""docstring."""
try:
if self.seq == other.seq and str(self.__dict__) == str(other.__dict__):
return True
except AttributeError:
pass
return False
def __ne__(self, other):
"""docstring."""
return not self.__eq__(other)
def __hash__(self):
"""__hash__ must be based on __eq__."""
return hash((str(self.seq).lower(), str(tuple(sorted(self.__dict__.items())))))
[docs] def linearize(self, *enzymes):
"""Similar to `:func:cut`.
Throws an exception if there is not excactly one cut
i.e. none or more than one digestion products.
"""
if not self.seq.circular:
raise TypeError("Can only linearize circular molecules!")
fragments = self.cut(*enzymes)
if len(fragments) > 1:
raise TypeError("More than one fragment is formed!")
elif not fragments:
raise TypeError("The enzyme(s) do not cut!")
answer = fragments[0]
answer.id = "{name}_lin".format(name=self.name)
answer.name = answer.id[:16]
return fragments[0]
[docs] def no_cutters(self, batch: _RestrictionBatch = None):
"""docstring."""
return self.seq.no_cutters(batch=batch or CommOnly)
[docs] def unique_cutters(self, batch: _RestrictionBatch = None):
"""docstring."""
return self.seq.unique_cutters(batch=batch or CommOnly)
[docs] def once_cutters(self, batch: _RestrictionBatch = None):
"""docstring."""
return self.seq.once_cutters(batch=batch or CommOnly)
[docs] def twice_cutters(self, batch: _RestrictionBatch = None):
"""docstring."""
return self.seq.twice_cutters(batch=batch or CommOnly)
[docs] def n_cutters(self, n=3, batch: _RestrictionBatch = None):
"""docstring."""
return self.seq.n_cutters(n=n, batch=batch or CommOnly)
[docs] def cutters(self, batch: _RestrictionBatch = None):
"""docstring."""
return self.seq.cutters(batch=batch or CommOnly)
[docs] def number_of_cuts(self, *enzymes):
"""The number of cuts by digestion with the Restriction enzymes
contained in the iterable."""
return sum([len(enzyme.search(self.seq)) for enzyme in _flatten(enzymes)])
[docs] def cas9(self, RNA: str):
"""docstring."""
fragments = []
result = []
for target in (self.seq, self.seq.rc()):
fragments = [self[sl.start : sl.stop] for sl in target.cas9(RNA)]
result.append(fragments)
return result
[docs] def reverse_complement(self):
"""Reverse complement.
Examples
--------
>>> from pydna.dseqrecord import Dseqrecord
>>> a=Dseqrecord("ggaatt")
>>> a
Dseqrecord(-6)
>>> a.seq
Dseq(-6)
ggaatt
ccttaa
>>> a.reverse_complement().seq
Dseq(-6)
aattcc
ttaagg
>>>
See Also
--------
pydna.dseq.Dseq.reverse_complement
"""
answer = type(self)(super().reverse_complement())
answer.name = "{}_rc".format(self.name[:13])
answer.description = self.description + "_rc"
answer.id = self.id + "_rc"
answer.seq.circular = self.seq.circular
# answer.seq._linear = self.seq.linear
return answer
rc = reverse_complement
# @_memorize("pydna.dseqrecord.Dseqrecord.synced")
[docs] def synced(self, ref, limit=25):
"""This method returns a new circular sequence (Dseqrecord object), which has been rotated
in such a way that there is maximum overlap between the sequence and
ref, which may be a string, Biopython Seq, SeqRecord object or
another Dseqrecord object.
The reason for using this could be to rotate a new recombinant plasmid so
that it starts at the same position after cloning. See the example below:
Examples
--------
>>> from pydna.dseqrecord import Dseqrecord
>>> a=Dseqrecord("gaat", circular=True)
>>> a.seq
Dseq(o4)
gaat
ctta
>>> d = a[2:] + a[:2]
>>> d.seq
Dseq(-4)
atga
tact
>>> insert=Dseqrecord("CCC")
>>> recombinant = (d+insert).looped()
>>> recombinant.seq
Dseq(o7)
atgaCCC
tactGGG
>>> recombinant.synced(a).seq
Dseq(o7)
gaCCCat
ctGGGta
"""
if not self.circular:
raise TypeError("Only circular DNA can be synced!")
newseq = _copy.copy(self)
s = str(self.seq.watson).lower()
s_rc = str(self.seq.crick).lower()
if hasattr(ref, "seq"):
r = ref.seq
if hasattr(r, "watson"):
r = str(r.watson).lower()
else:
r = str(r).lower()
else:
r = str(ref.lower())
lim = min(limit, limit * (len(s) // limit) + 1)
c = _common_sub_strings(s + s, r, limit=lim)
d = _common_sub_strings(s_rc + s_rc, r, limit=lim)
c = [(x[0], x[2]) for x in c if x[1] == 0]
d = [(x[0], x[2]) for x in d if x[1] == 0]
if not c and not d:
raise TypeError("There is no overlap between sequences!")
if c:
start, length = c.pop(0)
else:
start, length = 0, 0
if d:
start_rc, length_rc = d.pop(0)
else:
start_rc, length_rc = 0, 0
if length_rc > length:
start = start_rc
newseq = newseq.rc()
if start == 0:
result = newseq
else:
result = newseq.shifted(start)
_module_logger.info("synced")
return result
[docs] def upper(self):
"""Returns an uppercase copy.
>>> from pydna.dseqrecord import Dseqrecord
>>> my_seq = Dseqrecord("aAa")
>>> my_seq.seq
Dseq(-3)
aAa
tTt
>>> upper = my_seq.upper()
>>> upper.seq
Dseq(-3)
AAA
TTT
>>>
Returns
-------
Dseqrecord
Dseqrecord object in uppercase
See also
--------
pydna.dseqrecord.Dseqrecord.lower"""
upper = _copy.deepcopy(self)
upper.seq = upper.seq.upper()
return upper
[docs] def lower(self):
""">>> from pydna.dseqrecord import Dseqrecord
>>> my_seq = Dseqrecord("aAa")
>>> my_seq.seq
Dseq(-3)
aAa
tTt
>>> upper = my_seq.upper()
>>> upper.seq
Dseq(-3)
AAA
TTT
>>> lower = my_seq.lower()
>>> lower
Dseqrecord(-3)
>>>
Returns
-------
Dseqrecord
Dseqrecord object in lowercase
See also
--------
pydna.dseqrecord.Dseqrecord.upper
"""
lower = _copy.deepcopy(self)
lower.seq = lower.seq.lower()
return lower
[docs] def orfs(self, minsize=300):
"""docstring."""
return tuple(Dseqrecord(self[x:y]) for x, y in self.seq.orfs(minsize=minsize))
[docs] def orfs_to_features(self, minsize=300):
"""docstring."""
features = []
for strand, s in ((1, self.seq), (-1, self.seq.rc())):
for x, y in s.orfs(minsize=minsize):
orf = self[x:y]
prt = orf.translate()
features.append(
_SeqFeature(
_SimpleLocation(x, y, strand=strand),
type="CDS",
qualifiers={
"note": f"{y - x}bp {(y - x) // 3}aa",
"checksum": [orf.seguid() + " (DNA)", prt.seguid() + " (protein)"],
"codon_start": 1,
"transl_table": 11,
"translation": str(prt.seq),
},
)
)
return features
def _copy_to_clipboard(self, sequence_format):
"""docstring."""
from pyperclip import copy
copy(self.format(sequence_format))
return None
[docs] def copy_gb_to_clipboard(self):
"""docstring."""
self._copy_to_clipboard("gb")
return None
[docs] def copy_fasta_to_clipboard(self):
"""docstring."""
self._copy_to_clipboard("fasta")
return None
[docs] def shifted(self, shift):
"""Circular Dseqrecord with a new origin <shift>.
This only works on circular Dseqrecords. If we consider the following
circular sequence:
| ``GAAAT <-- watson strand``
| ``CTTTA <-- crick strand``
The T and the G on the watson strand are linked together as well
as the A and the C of the of the crick strand.
if ``shift`` is 1, this indicates a new origin at position 1:
| new origin at the | symbol:
|
| ``G|AAAT``
| ``C|TTTA``
new sequence:
| ``AAATG``
| ``TTTAC``
Examples
--------
>>> from pydna.dseqrecord import Dseqrecord
>>> a=Dseqrecord("aaat",circular=True)
>>> a
Dseqrecord(o4)
>>> a.seq
Dseq(o4)
aaat
ttta
>>> b=a.shifted(1)
>>> b
Dseqrecord(o4)
>>> b.seq
Dseq(o4)
aata
ttat
"""
if not self.circular:
raise TypeError("Sequence is linear, origin can only be " "shifted for circular sequences.\n")
ln = len(self)
if not shift % ln:
return _copy.deepcopy(self) # shift is a multiple of ln or 0
else:
shift %= ln # 0<=shift<=ln
newseq = (self.seq[shift:] + self.seq[:shift]).looped()
newfeatures = _copy.deepcopy(self.features)
for feature in newfeatures:
feature.location = _shift_location(feature.location, -shift, ln)
newfeatures.sort(key=_operator.attrgetter("location.start"))
answer = _copy.deepcopy(self)
answer.features = newfeatures
answer.seq = newseq
return answer
[docs] def cut(self, *enzymes):
"""Digest a Dseqrecord object with one or more restriction enzymes.
returns a list of linear Dseqrecords. If there are no cuts, an empty
list is returned.
See also :func:`Dseq.cut`
Parameters
----------
enzymes : enzyme object or iterable of such objects
A Bio.Restriction.XXX restriction object or iterable of such.
Returns
-------
Dseqrecord_frags : list
list of Dseqrecord objects formed by the digestion
Examples
--------
>>> from pydna.dseqrecord import Dseqrecord
>>> a=Dseqrecord("ggatcc")
>>> from Bio.Restriction import BamHI
>>> a.cut(BamHI)
(Dseqrecord(-5), Dseqrecord(-5))
>>> frag1, frag2 = a.cut(BamHI)
>>> frag1.seq
Dseq(-5)
g
cctag
>>> frag2.seq
Dseq(-5)
gatcc
g
"""
cutsites = self.seq.get_cutsites(*enzymes)
cutsite_pairs = self.seq.get_cutsite_pairs(cutsites)
return tuple(self.apply_cut(*cs) for cs in cutsite_pairs)
[docs] def apply_cut(self, left_cut, right_cut):
dseq = self.seq.apply_cut(left_cut, right_cut)
# TODO: maybe remove depending on https://github.com/BjornFJohansson/pydna/issues/161
if left_cut == right_cut:
# Not really a cut, but to handle the general case
if left_cut is None:
features = _copy.deepcopy(self.features)
else:
# The features that span the origin if shifting with left_cut, but that do not cross
# the cut site should be included, and if there is a feature within the cut site, it should
# be duplicated. See https://github.com/BjornFJohansson/pydna/issues/180 for a practical example.
#
# Let's say we are going to open a circular plasmid like below (| inidicate cuts, numbers indicate
# features)
#
# 3333|3
# 1111
# 000
# XXXXatg|YYY
# XXX|tacYYYY
# 000
# 2222
#
left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(left_cut, True)
initial_shift = left_watson if left_ovhg < 0 else left_crick
features = self.shifted(initial_shift).features
# for f in features:
# print(f.id, f.location, _location_boundaries(f.location))
# Here, we have done what's shown below (* indicates the origin).
# The features 0 and 2 have the right location for the final product:
#
# 3*3333
# 1*111
# XXXX*atgYYY
# XXXX*tacYYY
# 000
# 2222
features_need_transfer = [
f for f in features if (_location_boundaries(f.location)[1] <= abs(left_ovhg))
]
features_need_transfer = [
_shift_feature(f, -abs(left_ovhg), len(self)) for f in features_need_transfer
]
# ^ ^^^^^^^^^
# Now we have shifted the features that end before the cut (0 and 1, but not 3), as if
# they referred to the below sequence (* indicates the origin):
#
# 1111
# 000
# XXXXatg*YYY
# XXXXtac*YYY
#
# The features 0 and 1 would have the right location if the final sequence had the same length
# as the original one. However, the final product is longer because of the overhang.
features += [_shift_feature(f, abs(left_ovhg), len(dseq)) for f in features_need_transfer]
# ^ ^^^^^^^^^
# So we shift back by the same amount in the opposite direction, but this time we pass the
# length of the final product.
# print(*features, sep='\n')
# Features like 3 are removed here
features = [
f
for f in features
if (
_location_boundaries(f.location)[1] <= len(dseq)
and _location_boundaries(f.location)[0] <= _location_boundaries(f.location)[1]
)
]
else:
left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(left_cut, True)
right_watson, right_crick, right_ovhg = self.seq.get_cut_parameters(right_cut, False)
left_edge = left_crick if left_ovhg > 0 else left_watson
right_edge = right_watson if right_ovhg > 0 else right_crick
features = self[left_edge:right_edge].features
return Dseqrecord(dseq, features=features)
if __name__ == "__main__":
cache = _os.getenv("pydna_cache")
_os.environ["pydna_cache"] = "nocache"
import doctest
doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS)
# _os.environ["pydna_cache"] = cache