Source code for pydna.dseqrecord

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2013-2023 by Björn Johansson.  All rights reserved.
# This code is part of the Python-dna distribution and governed by its
# license.  Please see the LICENSE.txt file that should have been included
# as part of this package.
"""This module provides the :class:`Dseqrecord` class, for handling double stranded
DNA sequences. The Dseqrecord holds sequence information in the form of a :class:`pydna.dseq.Dseq`
object. The Dseq and Dseqrecord classes are subclasses of Biopythons
Seq and SeqRecord classes, respectively.

The Dseq and Dseqrecord classes support the notion of circular and linear DNA topology.
"""
from Bio.Restriction import RestrictionBatch as _RestrictionBatch
from Bio.Restriction import CommOnly
from pydna.dseq import Dseq as _Dseq
from pydna._pretty import pretty_str as _pretty_str
from pydna.utils import flatten as _flatten, location_boundaries as _location_boundaries

# from pydna.utils import memorize as _memorize
from pydna.utils import rc as _rc
from pydna.utils import shift_location as _shift_location
from pydna.utils import shift_feature as _shift_feature
from pydna.common_sub_strings import common_sub_strings as _common_sub_strings
from Bio.SeqFeature import SeqFeature as _SeqFeature
from Bio import SeqIO
from Bio.SeqFeature import CompoundLocation as _CompoundLocation
from Bio.SeqFeature import SimpleLocation as _SimpleLocation
from pydna.seqrecord import SeqRecord as _SeqRecord
from Bio.Seq import translate as _translate
from pydna.utils import identifier_from_string as _identifier_from_string
import copy as _copy
import operator as _operator
import os as _os
import re as _re
import time as _time
import datetime as _datetime


import logging as _logging

_module_logger = _logging.getLogger("pydna." + __name__)


try:
    from IPython.display import display_html as _display_html
except ImportError:

    def _display_html(item, raw=None):
        return item


[docs]class Dseqrecord(_SeqRecord): """Dseqrecord is a double stranded version of the Biopython SeqRecord [#]_ class. The Dseqrecord object holds a Dseq object describing the sequence. Additionally, Dseqrecord hold meta information about the sequence in the from of a list of SeqFeatures, in the same way as the SeqRecord does. The Dseqrecord can be initialized with a string, Seq, Dseq, SeqRecord or another Dseqrecord. The sequence information will be stored in a Dseq object in all cases. Dseqrecord objects can be read or parsed from sequences in FASTA, EMBL or Genbank formats. See the :mod:`pydna.readers` and :mod:`pydna.parsers` modules for further information. There is a short representation associated with the Dseqrecord. ``Dseqrecord(-3)`` represents a linear sequence of length 2 while ``Dseqrecord(o7)`` represents a circular sequence of length 7. Dseqrecord and Dseq share the same concept of length. This length can be larger than each strand alone if they are staggered as in the example below. :: <-- length --> GATCCTTT AAAGCCTAG Parameters ---------- record : string, Seq, SeqRecord, Dseq or other Dseqrecord object This data will be used to form the seq property circular : bool, optional True or False reflecting the shape of the DNA molecule linear : bool, optional True or False reflecting the shape of the DNA molecule Examples -------- >>> from pydna.dseqrecord import Dseqrecord >>> a=Dseqrecord("aaa") >>> a Dseqrecord(-3) >>> a.seq Dseq(-3) aaa ttt >>> from pydna.seq import Seq >>> b=Dseqrecord(Seq("aaa")) >>> b Dseqrecord(-3) >>> b.seq Dseq(-3) aaa ttt >>> from Bio.SeqRecord import SeqRecord >>> c=Dseqrecord(SeqRecord(Seq("aaa"))) >>> c Dseqrecord(-3) >>> c.seq Dseq(-3) aaa ttt References ---------- .. [#] http://biopython.org/wiki/SeqRecord """ def __init__( self, record, *args, circular=None, n=5e-14, # mol ( = 0.05 pmol) **kwargs, ): _module_logger.info("### Dseqrecord initialized ###") _module_logger.info("argument circular = %s", circular) _module_logger.info("circular = %s", circular) if isinstance(record, str): _module_logger.info("record is a string") super().__init__( _Dseq.from_string( record, # linear=linear, circular=bool(circular), ), *args, **kwargs, ) # record is a Dseq object ? elif hasattr(record, "watson"): if circular is False: record = record[:] elif circular is True: record = record.looped() _module_logger.info("record is a Dseq object") super().__init__(record, *args, **kwargs) # record is a Bio.Seq object ? elif hasattr(record, "transcribe"): _module_logger.info("record is a Seq object") super().__init__( _Dseq( str(record), # linear=linear, circular=bool(circular), ), *args, **kwargs, ) # record is a Bio.SeqRecord or Dseqrecord object ? elif hasattr(record, "features"): _module_logger.info("record is a Bio.SeqRecord or Dseqrecord object") for key, value in list(record.__dict__.items()): setattr(self, key, value) self.letter_annotations = {} # record.seq is a Dseq object ? if hasattr(record.seq, "watson"): new_seq = _copy.copy(record.seq) if circular is False: new_seq = new_seq[:] elif circular is True: new_seq = new_seq.looped() self.seq = new_seq # record.seq is Bio.SeqRecord object ? else: self.seq = _Dseq( str(record.seq), # linear=linear, circular=bool(circular), ) else: raise ValueError("don't know what to do with {}".format(record)) self.map_target = None self.n = n # amount, set to 5E-14 which is 5 pmols self.annotations.update({"molecule_type": "DNA"})
[docs] @classmethod def from_string( cls, record: str = "", *args, # linear=True, circular=False, n=5e-14, **kwargs, ): """docstring.""" # def from_string(cls, record:str="", *args, # linear=True, circular=False, n = 5E-14, **kwargs): obj = cls.__new__(cls) # Does not call __init__ obj._per_letter_annotations = {} obj.seq = _Dseq.quick( record, _rc(record), ovhg=0, # linear=linear, circular=circular, ) obj.id = _pretty_str("id") obj.name = _pretty_str("name") obj.description = _pretty_str("description") obj.dbxrefs = [] obj.annotations = {"molecule_type": "DNA"} obj.features = [] obj.map_target = None obj.n = n obj.__dict__.update(kwargs) return obj
[docs] @classmethod def from_SeqRecord( cls, record: _SeqRecord, *args, circular=None, n=5e-14, **kwargs, ): obj = cls.__new__(cls) # Does not call __init__ obj._per_letter_annotations = record._per_letter_annotations obj.id = record.id obj.name = record.name obj.description = record.description obj.dbxrefs = record.dbxrefs obj.annotations = {"molecule_type": "DNA"} obj.annotations.update(record.annotations) obj.features = record.features obj.map_target = None obj.n = n if circular is None: circular = record.annotations.get("topology") == "circular" obj.seq = _Dseq.quick(str(record.seq), _rc(str(record.seq)), ovhg=0, circular=circular) return obj
@property def circular(self): """The circular property can not be set directly. Use :meth:`looped`""" return self.seq.circular
[docs] def m(self): """This method returns the mass of the DNA molecule in grams. This is calculated as the product between the molecular weight of the Dseq object and the""" return self.seq.mw() * self.n # Da(g/mol) * mol = g
[docs] def extract_feature(self, n): """Extracts a feature and creates a new Dseqrecord object. Parameters ---------- n : int Indicates the feature to extract Examples -------- >>> from pydna.dseqrecord import Dseqrecord >>> a=Dseqrecord("atgtaa") >>> a.add_feature(2,4) >>> b=a.extract_feature(0) >>> b Dseqrecord(-2) >>> b.seq Dseq(-2) gt ca """ return super().extract_feature(n)
[docs] def add_feature(self, x=None, y=None, seq=None, type_="misc", strand=1, *args, **kwargs): """Add a feature of type misc to the feature list of the sequence. Parameters ---------- x : int Indicates start of the feature y : int Indicates end of the feature Examples -------- >>> from pydna.seqrecord import SeqRecord >>> a=SeqRecord("atgtaa") >>> a.features [] >>> a.add_feature(2,4) >>> a.features [SeqFeature(SimpleLocation(ExactPosition(2), ExactPosition(4), strand=1), type='misc', qualifiers=...)] """ if x and y and self.circular and x > y: pass else: super().add_feature(x, y, seq, type_, strand=strand, *args, **kwargs) return qualifiers = {} qualifiers.update(kwargs) location = _CompoundLocation( ( _SimpleLocation(x, self.seq.length, strand=strand), _SimpleLocation(0, y, strand=strand), ) ) sf = _SeqFeature(location, type=type_, qualifiers=qualifiers) if "label" not in qualifiers: qualifiers["label"] = [f"ft{len(location)}"] if sf.extract(self).isorf(): qualifiers["label"] = [f"orf{len(location)}"] self.features.append(sf)
[docs] def seguid(self): """Url safe SEGUID for the sequence. This checksum is the same as seguid but with base64.urlsafe encoding instead of the normal base64. This means that the characters + and / are replaced with - and _ so that the checksum can be part of a URL. Examples -------- >>> from pydna.dseqrecord import Dseqrecord >>> a = Dseqrecord("aa") >>> a.seguid() 'ldseguid=TEwydy0ugvGXh3VJnVwgtxoyDQA' """ return self.seq.seguid()
[docs] def looped(self): """ Circular version of the Dseqrecord object. The underlying linear Dseq object has to have compatible ends. Examples -------- >>> from pydna.dseqrecord import Dseqrecord >>> a=Dseqrecord("aaa") >>> a Dseqrecord(-3) >>> b=a.looped() >>> b Dseqrecord(o3) >>> See Also -------- pydna.dseq.Dseq.looped """ new = _copy.copy(self) # for key, value in list(self.__dict__.items()): # setattr(new, key, value) new._seq = self.seq.looped() five_prime = self.seq.five_prime_end() for fn, fo in zip(new.features, self.features): if five_prime[0] == "5'": pass # fn.location = fn.location + self.seq.ovhg elif five_prime[0] == "3'": fn.location = fn.location + (-self.seq.ovhg) if fn.location.start < 0: loc1 = _SimpleLocation(len(new) + fn.location.start, len(new), strand=fn.location.strand) loc2 = _SimpleLocation(0, fn.location.end, strand=fn.location.strand) fn.location = _CompoundLocation([loc1, loc2]) if fn.location.end > len(new): loc1 = _SimpleLocation(fn.location.start, len(new), strand=fn.location.strand) loc2 = _SimpleLocation(0, fn.location.end - len(new), strand=fn.location.strand) fn.location = _CompoundLocation([loc1, loc2]) fn.qualifiers = fo.qualifiers return new
[docs] def tolinear(self): # pragma: no cover """ Returns a linear, blunt copy of a circular Dseqrecord object. The underlying Dseq object has to be circular. This method is deprecated, use slicing instead. See example below. Examples -------- >>> from pydna.dseqrecord import Dseqrecord >>> a=Dseqrecord("aaa", circular = True) >>> a Dseqrecord(o3) >>> b=a[:] >>> b Dseqrecord(-3) >>> """ import warnings as _warnings from pydna import _PydnaDeprecationWarning _warnings.warn( "tolinear method is obsolete; " "please use obj[:] " "instead of obj.tolinear().", _PydnaDeprecationWarning, ) new = _copy.copy(self) for key, value in list(self.__dict__.items()): setattr(new, key, value) # new._seq = self.seq.tolinear() for fn, fo in zip(new.features, self.features): fn.qualifiers = fo.qualifiers return new
[docs] def terminal_transferase(self, nucleotides="a"): """docstring.""" newseq = _copy.deepcopy(self) newseq.seq = self.seq.terminal_transferase(nucleotides) for feature in newseq.features: feature.location += len(nucleotides) return newseq
[docs] def format(self, f="gb"): """Returns the sequence as a string using a format supported by Biopython SeqIO [#]_. Default is "gb" which is short for Genbank. Examples -------- >>> from pydna.dseqrecord import Dseqrecord >>> x=Dseqrecord("aaa") >>> x.annotations['date'] = '02-FEB-2013' >>> x Dseqrecord(-3) >>> print(x.format("gb")) LOCUS name 3 bp DNA linear UNK 02-FEB-2013 DEFINITION description. ACCESSION id VERSION id KEYWORDS . SOURCE . ORGANISM . . FEATURES Location/Qualifiers ORIGIN 1 aaa // References ---------- .. [#] http://biopython.org/wiki/SeqIO """ record = _copy.deepcopy(self) if f in ("genbank", "gb") and self.circular: record.annotations["topology"] = "circular" else: record.annotations["topology"] = "linear" return _SeqRecord.format(record, f).strip()
[docs] def write(self, filename=None, f="gb"): """Writes the Dseqrecord to a file using the format f, which must be a format supported by Biopython SeqIO for writing [#]_. Default is "gb" which is short for Genbank. Note that Biopython SeqIO reads more formats than it writes. Filename is the path to the file where the sequece is to be written. The filename is optional, if it is not given, the description property (string) is used together with the format. If obj is the Dseqrecord object, the default file name will be: ``<obj.locus>.<f>`` Where <f> is "gb" by default. If the filename already exists and AND the sequence it contains is different, a new file name will be used so that the old file is not lost: ``<obj.locus>_NEW.<f>`` References ---------- .. [#] http://biopython.org/wiki/SeqIO """ msg = "" if not filename: filename = "{name}.{type}".format(name=self.locus, type=f) # generate a name if no name was given # if not isinstance(filename, str): # is filename a string??? # raise ValueError("filename has to be a string, got", type(filename)) name, ext = _os.path.splitext(filename) msg = f"<font face=monospace><a href='{filename}' target='_blank'>{filename}</a></font><br>" if not _os.path.isfile(filename): with open(filename, "w", encoding="utf8") as fp: fp.write(self.format(f)) else: from pydna.readers import read old_file = read(filename) if self.seq != old_file.seq: # If new sequence is different, the old file is # renamed with "_OLD_" suffix: oldmtime = _datetime.datetime.fromtimestamp(_os.path.getmtime(filename)).isoformat() tstmp = int(_time.time() * 1_000_000) old_filename = f"{name}_OLD_{tstmp}{ext}" _os.rename(filename, old_filename) with open(filename, "w", encoding="utf8") as fp: fp.write(self.format(f)) newmtime = _datetime.datetime.fromtimestamp(_os.path.getmtime(filename)).isoformat() msg = f""" <table style="padding:10px 10px; word-break:normal; border-color:#fe0000; border-collapse:collapse; border-spacing:1; font-family:monospace; font-size:large; font-weight:bold; text-align:left; border: 5px solid red;"> <thead> <tr style="color:#0000FF;border: 1px solid;text-align:left;"> <th style="color:#fe0000;border: 1px solid;text-align:center;font-size:xxx-large;text-align:left;">&#9888</th> <th style="color:#f56b00;border: 1px solid;text-align:left;" colspan="2">Sequence change</th> </tr> </thead> <tbody> <tr style="color:#0000FF;border: 1px solid;text-align:left;"> <td>Filename</td> <td style="color:#fe0000;border: 1px solid;text-align:left;"><a href='{filename}' target='_blank'>{filename}</a></td> <td style="color:#32cb00;border: 1px solid;text-align:left;"><a href='{old_filename}' target='_blank'>{old_filename}</a></td> </tr> <tr style="color:#0000FF;border: 1px solid;text-align:left;"> <td >Saved</td> <td style="color:#fe0000;border: 1px solid;text-align:left;">{newmtime}</td> <td style="color:#32cb00;border: 1px solid;text-align:left;">{oldmtime}</td> </tr> <tr style="color:#0000FF;border: 1px solid;text-align:left;"> <td>Length</td> <td style="color:#fe0000;border: 1px solid;text-align:left;">{len(self)}</td> <td style="color:#32cb00;border: 1px solid;text-align:left;">{len(old_file)}</td> </tr> <tr style="color:#0000FF;border: 1px solid;text-align:left;"> <td>SEGUID</td> <td style="color:#fe0000;border: 1px solid;text-align:left;">{self.seguid()}</td> <td style="color:#32cb00;border: 1px solid;text-align:left;">{old_file.seguid()}</td> </tr> </tbody> </table> """ elif "seguid" in old_file.annotations.get("comment", ""): pattern = r"(ldseguid|cdseguid)-(\S{27})(_[0-9]{4}-[0-9]{2}-[0-9]{2}T[0-9]{2}:[0-9]{2}:[0-9]{2}\.[0-9]{6}){0,1}" # seguid=NNNNNNNNNNNNNNNNNNNNNNNNNNN_2020-10-10T11:11:11.111111 oldstamp = _re.search(pattern, old_file.description) newstamp = _re.search(pattern, self.description) newdescription = self.description if oldstamp and newstamp: if oldstamp.group(0)[:35] == newstamp.group(0)[:35]: newdescription = newdescription.replace(newstamp.group(0), oldstamp.group(0)) elif oldstamp: newdescription += " " + oldstamp.group(0) newobj = _copy.copy(self) newobj.description = newdescription with open(filename, "w", encoding="utf8") as fp: fp.write(newobj.format(f)) else: with open(filename, "w", encoding="utf8") as fp: fp.write(self.format(f)) return _display_html(msg, raw=True)
[docs] def find(self, other): # TODO allow strings, seqs, seqrecords or Dseqrecords # TODO check for linearity of other, raise exception if not # TODO add tests and docstring for this method o = str(other.seq).upper() if not self.circular: s = str(self.seq).upper() else: # allow wrapping around origin s = str(self.seq).upper() + str(self.seq).upper()[: len(other) - 1] return s.find(o)
def __str__(self): return ("Dseqrecord\n" "circular: {}\n" "size: {}\n").format(self.circular, len(self)) + _SeqRecord.__str__( self ) def __contains__(self, other): if other.lower() in str(self.seq).lower(): return True else: s = self.seq.watson.replace(" ", "") ln = len(s) spc = 3 - ln % 3 if ln % 3 else 0 s = "n" * spc + s + "nnn" for frame in range(3): if other.lower() in _translate(s[frame : frame + spc + ln]).lower(): return True return False
[docs] def find_aminoacids(self, other): """ >>> from pydna.dseqrecord import Dseqrecord >>> s=Dseqrecord("atgtacgatcgtatgctggttatattttag") >>> s.seq.translate() Seq('MYDRMLVIF*') >>> "RML" in s True >>> "MMM" in s False >>> s.seq.rc().translate() Seq('LKYNQHTIVH') >>> "QHT" in s.rc() True >>> "QHT" in s False >>> slc = s.find_aa("RML") >>> slc slice(9, 18, None) >>> s[slc] Dseqrecord(-9) >>> code = s[slc].seq >>> code Dseq(-9) cgtatgctg gcatacgac >>> code.translate() Seq('RML') """ other = str(other).lower() assert self.seq.watson == "".join(self.seq.watson.split()) s = self.seq.watson ln = len(s) spc = 3 - ln % 3 if ln % 3 else 0 s = s + "n" * spc + "nnn" start = None for frame in range(3): try: start = _translate(s[frame : frame + ln + spc]).lower().index(other) break except ValueError: pass oh = self.seq.ovhg if self.seq.ovhg > 0 else 0 if start is None: return None # TODO return an emoty slice or False...? else: return slice(frame + start * 3 + oh, frame + (start + len(other)) * 3 + oh)
find_aa = find_aminoacids
[docs] def map_trace_files(self, pth, limit=25): # TODO allow path-like objects import glob traces = [] for name in glob.glob(pth): trace = SeqIO.read(name, "abi").lower() trace.annotations["filename"] = trace.fname = name traces.append(trace) if not traces: raise ValueError("No trace files found in {}".format(pth)) if hasattr(self.map_target, "step"): area = self.map_target elif hasattr(self.map_target, "extract"): area = slice(self.map_target.location.start, self.map_target.location.end) else: area = None # TODO allow other objects as well and do some checks on map target if area: self.matching_reads = [] self.not_matching_reads = [] target = str(self[area].seq).lower() target_rc = str(self[area].seq.rc()).lower() for trace in traces: if target in str(trace.seq) or target_rc in str(trace.seq): self.matching_reads.append(trace) else: self.not_matching_reads.append(trace) reads = self.matching_reads else: self.matching_reads = None self.not_matching_reads = None reads = traces matching_reads = [] for read_ in reads: matches = _common_sub_strings(str(self.seq).lower(), str(read_.seq), limit) if not matches: continue if len(matches) > 1: newmatches = [ matches[0], ] for i, x in enumerate(matches[1:]): g, f, h = matches[i] if g + h < x[0] and f + h < x[1]: newmatches.append(x) else: # len(matches)==1 newmatches = matches matching_reads.append(read_) if len(newmatches) > 1: ms = [] for m in newmatches: ms.append(_SimpleLocation(m[0], m[0] + m[2])) loc = _CompoundLocation(ms) else: a, b, c = newmatches[0] loc = _SimpleLocation(a, a + c) self.features.append( _SeqFeature( loc, qualifiers={"label": [read_.annotations["filename"]]}, type="trace", ) ) return [x.annotations["filename"] for x in matching_reads]
def __repr__(self): return "Dseqrecord({}{})".format({True: "-", False: "o"}[not self.circular], len(self)) def _repr_pretty_(self, p, cycle): p.text("Dseqrecord({}{})".format({True: "-", False: "o"}[not self.circular], len(self))) def __add__(self, other): if hasattr(other, "seq") and hasattr(other.seq, "watson"): other = _copy.deepcopy(other) other_five_prime = other.seq.five_prime_end() if other_five_prime[0] == "5'": # add other.seq.ovhg for f in other.features: f.location = f.location + other.seq.ovhg elif other_five_prime[0] == "3'": # subtract other.seq.ovhg (sign change) for f in other.features: f.location = f.location + (-other.seq.ovhg) answer = Dseqrecord(_SeqRecord.__add__(self, other)) answer.n = min(self.n, other.n) else: answer = Dseqrecord(_SeqRecord.__add__(self, Dseqrecord(other))) answer.n = self.n return answer def __mul__(self, number): if not isinstance(number, int): raise TypeError("TypeError: can't multiply Dseqrecord by non-int of type {}".format(type(number))) if self.circular: raise TypeError("TypeError: can't multiply circular Dseqrecord.") if number > 0: new = _copy.copy(self) for i in range(1, number): new += self return new else: return self.__class__("") def __getitem__(self, sl): """docstring.""" answer = Dseqrecord(_copy.copy(self)) answer.seq = self.seq.__getitem__(sl) # answer.seq.alphabet = self.seq.alphabet # breakpoint() sl_start = sl.start or 0 # 6 sl_stop = sl.stop or len(self.seq) # 1 if not self.circular or sl_start < sl_stop: # TODO: special case for sl_end == 0 in circular sequences # related to https://github.com/BjornFJohansson/pydna/issues/161 if self.circular and sl.stop == 0: sl = slice(sl.start, len(self.seq), sl.step) answer.features = super().__getitem__(sl).features elif self.circular and sl_start > sl_stop: answer.features = self.shifted(sl_start).features # origin-spanning features should only be included after shifting # in cases where the slice comprises the entire sequence, but then # sl_start == sl_stop and the second condition is not met answer.features = [ f for f in answer.features if ( _location_boundaries(f.location)[1] <= answer.seq.length and _location_boundaries(f.location)[0] < _location_boundaries(f.location)[1] ) ] elif self.circular and sl_start == sl_stop: cut = ((sl_start, 0), None) return self.apply_cut(cut, cut) else: answer = Dseqrecord("") identifier = "part_{id}".format(id=self.id) if answer.features: sf = max(answer.features, key=len) # default if "label" in sf.qualifiers: identifier = " ".join(sf.qualifiers["label"]) elif "note" in sf.qualifiers: identifier = " ".join(sf.qualifiers["note"]) answer.id = _identifier_from_string(identifier)[:16] answer.name = _identifier_from_string("part_{name}".format(name=self.name))[:16] return answer def __eq__(self, other): """docstring.""" try: if self.seq == other.seq and str(self.__dict__) == str(other.__dict__): return True except AttributeError: pass return False def __ne__(self, other): """docstring.""" return not self.__eq__(other) def __hash__(self): """__hash__ must be based on __eq__.""" return hash((str(self.seq).lower(), str(tuple(sorted(self.__dict__.items())))))
[docs] def linearize(self, *enzymes): """Similar to `:func:cut`. Throws an exception if there is not excactly one cut i.e. none or more than one digestion products. """ if not self.seq.circular: raise TypeError("Can only linearize circular molecules!") fragments = self.cut(*enzymes) if len(fragments) > 1: raise TypeError("More than one fragment is formed!") elif not fragments: raise TypeError("The enzyme(s) do not cut!") answer = fragments[0] answer.id = "{name}_lin".format(name=self.name) answer.name = answer.id[:16] return fragments[0]
[docs] def no_cutters(self, batch: _RestrictionBatch = None): """docstring.""" return self.seq.no_cutters(batch=batch or CommOnly)
[docs] def unique_cutters(self, batch: _RestrictionBatch = None): """docstring.""" return self.seq.unique_cutters(batch=batch or CommOnly)
[docs] def once_cutters(self, batch: _RestrictionBatch = None): """docstring.""" return self.seq.once_cutters(batch=batch or CommOnly)
[docs] def twice_cutters(self, batch: _RestrictionBatch = None): """docstring.""" return self.seq.twice_cutters(batch=batch or CommOnly)
[docs] def n_cutters(self, n=3, batch: _RestrictionBatch = None): """docstring.""" return self.seq.n_cutters(n=n, batch=batch or CommOnly)
[docs] def cutters(self, batch: _RestrictionBatch = None): """docstring.""" return self.seq.cutters(batch=batch or CommOnly)
[docs] def number_of_cuts(self, *enzymes): """The number of cuts by digestion with the Restriction enzymes contained in the iterable.""" return sum([len(enzyme.search(self.seq)) for enzyme in _flatten(enzymes)])
[docs] def cas9(self, RNA: str): """docstring.""" fragments = [] result = [] for target in (self.seq, self.seq.rc()): fragments = [self[sl.start : sl.stop] for sl in target.cas9(RNA)] result.append(fragments) return result
[docs] def reverse_complement(self): """Reverse complement. Examples -------- >>> from pydna.dseqrecord import Dseqrecord >>> a=Dseqrecord("ggaatt") >>> a Dseqrecord(-6) >>> a.seq Dseq(-6) ggaatt ccttaa >>> a.reverse_complement().seq Dseq(-6) aattcc ttaagg >>> See Also -------- pydna.dseq.Dseq.reverse_complement """ answer = type(self)(super().reverse_complement()) answer.name = "{}_rc".format(self.name[:13]) answer.description = self.description + "_rc" answer.id = self.id + "_rc" answer.seq.circular = self.seq.circular # answer.seq._linear = self.seq.linear return answer
rc = reverse_complement # @_memorize("pydna.dseqrecord.Dseqrecord.synced")
[docs] def synced(self, ref, limit=25): """This method returns a new circular sequence (Dseqrecord object), which has been rotated in such a way that there is maximum overlap between the sequence and ref, which may be a string, Biopython Seq, SeqRecord object or another Dseqrecord object. The reason for using this could be to rotate a new recombinant plasmid so that it starts at the same position after cloning. See the example below: Examples -------- >>> from pydna.dseqrecord import Dseqrecord >>> a=Dseqrecord("gaat", circular=True) >>> a.seq Dseq(o4) gaat ctta >>> d = a[2:] + a[:2] >>> d.seq Dseq(-4) atga tact >>> insert=Dseqrecord("CCC") >>> recombinant = (d+insert).looped() >>> recombinant.seq Dseq(o7) atgaCCC tactGGG >>> recombinant.synced(a).seq Dseq(o7) gaCCCat ctGGGta """ if not self.circular: raise TypeError("Only circular DNA can be synced!") newseq = _copy.copy(self) s = str(self.seq.watson).lower() s_rc = str(self.seq.crick).lower() if hasattr(ref, "seq"): r = ref.seq if hasattr(r, "watson"): r = str(r.watson).lower() else: r = str(r).lower() else: r = str(ref.lower()) lim = min(limit, limit * (len(s) // limit) + 1) c = _common_sub_strings(s + s, r, limit=lim) d = _common_sub_strings(s_rc + s_rc, r, limit=lim) c = [(x[0], x[2]) for x in c if x[1] == 0] d = [(x[0], x[2]) for x in d if x[1] == 0] if not c and not d: raise TypeError("There is no overlap between sequences!") if c: start, length = c.pop(0) else: start, length = 0, 0 if d: start_rc, length_rc = d.pop(0) else: start_rc, length_rc = 0, 0 if length_rc > length: start = start_rc newseq = newseq.rc() if start == 0: result = newseq else: result = newseq.shifted(start) _module_logger.info("synced") return result
[docs] def upper(self): """Returns an uppercase copy. >>> from pydna.dseqrecord import Dseqrecord >>> my_seq = Dseqrecord("aAa") >>> my_seq.seq Dseq(-3) aAa tTt >>> upper = my_seq.upper() >>> upper.seq Dseq(-3) AAA TTT >>> Returns ------- Dseqrecord Dseqrecord object in uppercase See also -------- pydna.dseqrecord.Dseqrecord.lower""" upper = _copy.deepcopy(self) upper.seq = upper.seq.upper() return upper
[docs] def lower(self): """>>> from pydna.dseqrecord import Dseqrecord >>> my_seq = Dseqrecord("aAa") >>> my_seq.seq Dseq(-3) aAa tTt >>> upper = my_seq.upper() >>> upper.seq Dseq(-3) AAA TTT >>> lower = my_seq.lower() >>> lower Dseqrecord(-3) >>> Returns ------- Dseqrecord Dseqrecord object in lowercase See also -------- pydna.dseqrecord.Dseqrecord.upper """ lower = _copy.deepcopy(self) lower.seq = lower.seq.lower() return lower
[docs] def orfs(self, minsize=300): """docstring.""" return tuple(Dseqrecord(self[x:y]) for x, y in self.seq.orfs(minsize=minsize))
[docs] def orfs_to_features(self, minsize=300): """docstring.""" features = [] for strand, s in ((1, self.seq), (-1, self.seq.rc())): for x, y in s.orfs(minsize=minsize): orf = self[x:y] prt = orf.translate() features.append( _SeqFeature( _SimpleLocation(x, y, strand=strand), type="CDS", qualifiers={ "note": f"{y - x}bp {(y - x) // 3}aa", "checksum": [orf.seguid() + " (DNA)", prt.seguid() + " (protein)"], "codon_start": 1, "transl_table": 11, "translation": str(prt.seq), }, ) ) return features
def _copy_to_clipboard(self, sequence_format): """docstring.""" from pyperclip import copy copy(self.format(sequence_format)) return None
[docs] def copy_gb_to_clipboard(self): """docstring.""" self._copy_to_clipboard("gb") return None
[docs] def copy_fasta_to_clipboard(self): """docstring.""" self._copy_to_clipboard("fasta") return None
[docs] def figure(self, feature=0, highlight="\x1b[48;5;11m", plain="\x1b[0m"): """docstring.""" if self.features: f = self.features[feature] locations = sorted(self.features[feature].location.parts, key=_SimpleLocation.start.fget) strand = f.location.strand else: locations = [_SimpleLocation(0, 0, 1)] strand = 1 ovhg = self.seq.ovhg + len(self.seq.watson) - len(self.seq.crick) w = f"{self.seq.ovhg * chr(32)}{self.seq.watson}{-ovhg * chr(32)}" c = f"{-self.seq.ovhg * chr(32)}{self.seq.crick[::-1]}{ovhg * chr(32)}" if strand == 1: s1, s2 = w, c else: s1, s2 = c, w wfe = [f"{highlight}{s1[part.start:part.end]}{plain}" for part in locations] wfe.append("") wof = [s1[0 : locations[0].start]] for f, s in zip(locations, locations[1:]): wof.append(s1[f.end : s.start]) wof.append(s1[locations[-1].end : len(self)]) topology = {True: "-", False: "o"}[not self.circular] result = f"{self.__class__.__name__}({topology}{len(self)})\n" s1 = "".join(f + s for f, s in zip(wof, wfe)) if strand == 1: result += f"{s1}\n{s2}" else: result += f"{s2}\n{s1}" return _pretty_str(result)
[docs] def shifted(self, shift): """Circular Dseqrecord with a new origin <shift>. This only works on circular Dseqrecords. If we consider the following circular sequence: | ``GAAAT <-- watson strand`` | ``CTTTA <-- crick strand`` The T and the G on the watson strand are linked together as well as the A and the C of the of the crick strand. if ``shift`` is 1, this indicates a new origin at position 1: | new origin at the | symbol: | | ``G|AAAT`` | ``C|TTTA`` new sequence: | ``AAATG`` | ``TTTAC`` Examples -------- >>> from pydna.dseqrecord import Dseqrecord >>> a=Dseqrecord("aaat",circular=True) >>> a Dseqrecord(o4) >>> a.seq Dseq(o4) aaat ttta >>> b=a.shifted(1) >>> b Dseqrecord(o4) >>> b.seq Dseq(o4) aata ttat """ if not self.circular: raise TypeError("Sequence is linear, origin can only be " "shifted for circular sequences.\n") ln = len(self) if not shift % ln: return _copy.deepcopy(self) # shift is a multiple of ln or 0 else: shift %= ln # 0<=shift<=ln newseq = (self.seq[shift:] + self.seq[:shift]).looped() newfeatures = _copy.deepcopy(self.features) for feature in newfeatures: feature.location = _shift_location(feature.location, -shift, ln) newfeatures.sort(key=_operator.attrgetter("location.start")) answer = _copy.deepcopy(self) answer.features = newfeatures answer.seq = newseq return answer
[docs] def cut(self, *enzymes): """Digest a Dseqrecord object with one or more restriction enzymes. returns a list of linear Dseqrecords. If there are no cuts, an empty list is returned. See also :func:`Dseq.cut` Parameters ---------- enzymes : enzyme object or iterable of such objects A Bio.Restriction.XXX restriction object or iterable of such. Returns ------- Dseqrecord_frags : list list of Dseqrecord objects formed by the digestion Examples -------- >>> from pydna.dseqrecord import Dseqrecord >>> a=Dseqrecord("ggatcc") >>> from Bio.Restriction import BamHI >>> a.cut(BamHI) (Dseqrecord(-5), Dseqrecord(-5)) >>> frag1, frag2 = a.cut(BamHI) >>> frag1.seq Dseq(-5) g cctag >>> frag2.seq Dseq(-5) gatcc g """ cutsites = self.seq.get_cutsites(*enzymes) cutsite_pairs = self.seq.get_cutsite_pairs(cutsites) return tuple(self.apply_cut(*cs) for cs in cutsite_pairs)
[docs] def apply_cut(self, left_cut, right_cut): dseq = self.seq.apply_cut(left_cut, right_cut) # TODO: maybe remove depending on https://github.com/BjornFJohansson/pydna/issues/161 if left_cut == right_cut: # Not really a cut, but to handle the general case if left_cut is None: features = _copy.deepcopy(self.features) else: # The features that span the origin if shifting with left_cut, but that do not cross # the cut site should be included, and if there is a feature within the cut site, it should # be duplicated. See https://github.com/BjornFJohansson/pydna/issues/180 for a practical example. # # Let's say we are going to open a circular plasmid like below (| inidicate cuts, numbers indicate # features) # # 3333|3 # 1111 # 000 # XXXXatg|YYY # XXX|tacYYYY # 000 # 2222 # left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(left_cut, True) initial_shift = left_watson if left_ovhg < 0 else left_crick features = self.shifted(initial_shift).features # for f in features: # print(f.id, f.location, _location_boundaries(f.location)) # Here, we have done what's shown below (* indicates the origin). # The features 0 and 2 have the right location for the final product: # # 3*3333 # 1*111 # XXXX*atgYYY # XXXX*tacYYY # 000 # 2222 features_need_transfer = [ f for f in features if (_location_boundaries(f.location)[1] <= abs(left_ovhg)) ] features_need_transfer = [ _shift_feature(f, -abs(left_ovhg), len(self)) for f in features_need_transfer ] # ^ ^^^^^^^^^ # Now we have shifted the features that end before the cut (0 and 1, but not 3), as if # they referred to the below sequence (* indicates the origin): # # 1111 # 000 # XXXXatg*YYY # XXXXtac*YYY # # The features 0 and 1 would have the right location if the final sequence had the same length # as the original one. However, the final product is longer because of the overhang. features += [_shift_feature(f, abs(left_ovhg), len(dseq)) for f in features_need_transfer] # ^ ^^^^^^^^^ # So we shift back by the same amount in the opposite direction, but this time we pass the # length of the final product. # print(*features, sep='\n') # Features like 3 are removed here features = [ f for f in features if ( _location_boundaries(f.location)[1] <= len(dseq) and _location_boundaries(f.location)[0] <= _location_boundaries(f.location)[1] ) ] else: left_watson, left_crick, left_ovhg = self.seq.get_cut_parameters(left_cut, True) right_watson, right_crick, right_ovhg = self.seq.get_cut_parameters(right_cut, False) left_edge = left_crick if left_ovhg > 0 else left_watson right_edge = right_watson if right_ovhg > 0 else right_crick features = self[left_edge:right_edge].features return Dseqrecord(dseq, features=features)
if __name__ == "__main__": cache = _os.getenv("pydna_cache") _os.environ["pydna_cache"] = "nocache" import doctest doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS) # _os.environ["pydna_cache"] = cache