Source code for pydna.seqrecord

#!/usr/bin/env python3
# -*- coding: utf-8 -*-

# Copyright 2013-2023 by Björn Johansson.  All rights reserved.
# This code is part of the Python-dna distribution and governed by its
# license.  Please see the LICENSE.txt file that should have been included
# as part of this package.

"""
A subclass of the Biopython SeqRecord class.

Has a number of extra methods and uses
the :class:`pydna._pretty_str.pretty_str` class instread of str for a
nicer output in the IPython shell.
"""


from Bio.SeqFeature import SeqFeature as _SeqFeature
from pydna._pretty import pretty_str as _pretty_str

from pydna.seq import ProteinSeq as _ProteinSeq
from pydna.common_sub_strings import common_sub_strings as _common_sub_strings

from Bio.Data.CodonTable import TranslationError as _TranslationError
from Bio.SeqRecord import SeqRecord as _SeqRecord
from Bio.SeqFeature import SimpleLocation as _SimpleLocation
from Bio.SeqFeature import CompoundLocation as _CompoundLocation
from pydna.seq import Seq as _Seq
from pydna._pretty import PrettyTable as _PrettyTable

import re as _re
import pickle as _pickle
from copy import copy as _copy

from pydna import _PydnaWarning
from warnings import warn as _warn

import logging as _logging
import datetime

_module_logger = _logging.getLogger("pydna." + __name__)


[docs]class SeqRecord(_SeqRecord):
    """
    A subclass of the Biopython SeqRecord class.

    Has a number of extra methods and uses
    the :class:`pydna._pretty_str.pretty_str` class instread of str for a
    nicer output in the IPython shell.
    """

    def __init__(self, seq, *args, id="id", name="name", description="description", **kwargs):
        if isinstance(seq, str):
            seq = _Seq(seq)
        super().__init__(seq, *args, id=id, name=name, description=description, **kwargs)
        self._fix_attributes()

    def _fix_attributes(self):
        self.id = _pretty_str(self.id)
        self.name = _pretty_str(self.name)
        self.description = _pretty_str(self.description)

        self.annotations.update({"molecule_type": "DNA"})
        self.map_target = None

        if not hasattr(self.seq, "transcribe"):
            self.seq = _Seq(self.seq)

        self.seq._data = b"".join(self.seq._data.split())  # remove whitespaces
        self.annotations = {_pretty_str(k): _pretty_str(v) for k, v in self.annotations.items()}

[docs]    @classmethod
    def from_Bio_SeqRecord(clc, sr: _SeqRecord):
        """Creates a pydnaSeqRecord from a Biopython SeqRecord."""
        # https://stackoverflow.com/questions/15404256/changing-the-\
        # class-of-a-python-object-casting
        sr.__class__ = clc
        sr._fix_attributes()
        return sr

    @property
    def locus(self):
        """Alias for name property."""
        return self.name

    @locus.setter
    def locus(self, value):
        """Alias for name property."""
        if len(value) > 16:
            shortvalue = value[:16]
            _warn(
                ("locus property {} truncated" "to 16 chars {}").format(value, shortvalue),
                _PydnaWarning,
                stacklevel=2,
            )
            value = shortvalue
        self.name = value
        return

    @property
    def accession(self):
        """Alias for id property."""
        return self.id

    @accession.setter
    def accession(self, value):
        """Alias for id property."""
        self.id = value
        return

    @property
    def definition(self):
        """Alias for description property."""
        return self.description

    @definition.setter
    def definition(self, value):
        """Alias for id property."""
        self.description = value
        return

[docs]    def reverse_complement(self, *args, **kwargs):
        """Return the reverse complement of the sequence."""
        answer = super().reverse_complement(*args, **kwargs)
        answer = type(self).from_Bio_SeqRecord(answer)
        return answer

    rc = reverse_complement

[docs]    def isorf(self, table=1):
        """Detect if sequence is an open reading frame (orf) in the 5'-3'.

        direction.

        Translation tables are numbers according to the NCBI numbering [#]_.

        Parameters
        ----------
        table  : int
            Sets the translation table, default is 1 (standard code)

        Returns
        -------
        bool
            True if sequence is an orf, False otherwise.


        Examples
        --------
        >>> from pydna.seqrecord import SeqRecord
        >>> a=SeqRecord("atgtaa")
        >>> a.isorf()
        True
        >>> b=SeqRecord("atgaaa")
        >>> b.isorf()
        False
        >>> c=SeqRecord("atttaa")
        >>> c.isorf()
        False

        References
        ----------
        .. [#] http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c

        """
        try:
            self.seq.translate(table=table, cds=True)
        except _TranslationError:
            return False
        else:
            return True

[docs]    def translate(self):
        """docstring."""
        p = super().translate()
        return ProteinSeqRecord(_ProteinSeq(p.seq[:-1]))

[docs]    def add_colors_to_features_for_ape(self):
        """Assign colors to features.

        compatible with
        the `ApE editor <http://jorgensen.biology.utah.edu/wayned/ape/>`_.

        """
        cols = (
            "#66ffa3",
            "#84ff66",
            "#e0ff66",
            "#ffc166",
            "#ff6666",
            "#ff99d6",
            "#ea99ff",
            "#ad99ff",
            "#99c1ff",
            "#99ffff",
            "#99ffc1",
            "#adff99",
            "#eaff99",
            "#ffd699",
            "#ff9999",
            "#ffccea",
            "#f4ccff",
            "#d6ccff",
            "#cce0ff",
            "#ccffff",
            "#ccffe0",
            "#d6ffcc",
            "#f4ffcc",
            "#ffeacc",
            "#ffcccc",
            "#ff66c1",
            "#e066ff",
            "#8466ff",
            "#66a3ff",
            "#66ffff",
        )

        for i, f in enumerate(self.features):
            f.qualifiers["ApEinfo_fwdcolor"] = [cols[i % len(cols)]]
            f.qualifiers["ApEinfo_revcolor"] = [cols[::-1][i % len(cols)]]

[docs]    def add_feature(self, x=None, y=None, seq=None, type_="misc", strand=1, *args, **kwargs):
        """Add a feature of type misc to the feature list of the sequence.

        Parameters
        ----------
        x  : int
            Indicates start of the feature
        y  : int
            Indicates end of the feature

        Examples
        --------
        >>> from pydna.seqrecord import SeqRecord
        >>> a=SeqRecord("atgtaa")
        >>> a.features
        []
        >>> a.add_feature(2,4)
        >>> a.features
        [SeqFeature(SimpleLocation(ExactPosition(2),
                                   ExactPosition(4),
                                   strand=1),
                    type='misc',
                    qualifiers=...)]
        """
        qualifiers = {}
        qualifiers.update(kwargs)

        if seq:
            if hasattr(seq, "seq"):
                seq = seq.seq
                if hasattr(seq, "watson"):
                    seq = str(seq.watson).lower()
                else:
                    seq = str(seq).lower()
            else:
                seq = str(seq).lower()
            x = self.seq.lower().find(seq)
            if x == -1:
                raise TypeError("Could not find {}".format(seq))
            y = x + len(seq)
        else:
            x = x or 0
            y = y or len(self)

        if "label" not in qualifiers:
            qualifiers["label"] = ["ft{}".format(y - x)]

            if self[x:y].isorf() or self[x:y].reverse_complement().isorf():
                qualifiers["label"] = ["orf{}".format(y - x)]

        try:
            location = _SimpleLocation(x, y, strand=strand)
        except ValueError as err:
            if self.circular:
                location = _CompoundLocation(
                    (
                        _SimpleLocation(x, self.seq.length, strand=strand),
                        _SimpleLocation(0, y, strand=strand),
                    )
                )
            else:
                raise err

        sf = _SeqFeature(location, type=type_, qualifiers=qualifiers)

        self.features.append(sf)

        """
        In [11]: a.seq.translate()
        Out[11]: Seq('K', ExtendedIUPACProtein())
        """

[docs]    def list_features(self):
        """Print ASCII table with all features.

        Examples
        --------
        >>> from pydna.seq import Seq
        >>> from pydna.seqrecord import SeqRecord
        >>> a=SeqRecord(Seq("atgtaa"))
        >>> a.add_feature(2,4)
        >>> print(a.list_features())
        +-----+---------------+-----+-----+-----+-----+------+------+
        | Ft# | Label or Note | Dir | Sta | End | Len | type | orf? |
        +-----+---------------+-----+-----+-----+-----+------+------+
        |   0 | L:ft2         | --> | 2   | 4   |   2 | misc |  no  |
        +-----+---------------+-----+-----+-----+-----+------+------+
        """
        x = _PrettyTable(["Ft#", "Label or Note", "Dir", "Sta", "End", "Len", "type", "orf?"])
        x.align["Ft#"] = "r"  # Left align
        x.align["Label or Note"] = "l"  # Left align
        x.align["Len"] = "r"
        x.align["Sta"] = "l"
        x.align["End"] = "l"
        x.align["type"] = "l"
        x.padding_width = 1  # One space between column edges and contents
        for i, sf in enumerate(self.features):
            try:
                lbl = sf.qualifiers["label"]
            except KeyError:
                try:
                    lbl = sf.qualifiers["note"]
                except KeyError:
                    lbl = "nd"
                else:
                    lbl = "N:{}".format(" ".join(lbl).strip())
            else:
                lbl = "L:{}".format(" ".join(lbl).strip())
            x.add_row(
                [
                    i,
                    lbl[:16],
                    {1: "-->", -1: "<--", 0: "---", None: "---"}[sf.location.strand],
                    sf.location.start,
                    sf.location.end,
                    len(sf),
                    sf.type,
                    {True: "yes", False: "no"}[
                        self.extract_feature(i).isorf() or self.extract_feature(i).reverse_complement().isorf()
                    ],
                ]
            )
        return x

[docs]    def extract_feature(self, n):
        """Extract feature and return a new SeqRecord object.

        Parameters
        ----------
        n  : int
        Indicates the feature to extract

        Examples
        --------
        >>> from pydna.seqrecord import SeqRecord
        >>> a=SeqRecord("atgtaa")
        >>> a.add_feature(2,4)
        >>> b=a.extract_feature(0)
        >>> b
        SeqRecord(seq=Seq('gt'), id='ft2', name='part_name',
                  description='description', dbxrefs=[])
        """
        return self.features[n].extract(self)

[docs]    def sorted_features(self):
        """Return a list of the features sorted by start position.

        Examples
        --------
        >>> from pydna.seqrecord import SeqRecord
        >>> a=SeqRecord("atgtaa")
        >>> a.add_feature(3,4)
        >>> a.add_feature(2,4)
        >>> print(a.features)
        [SeqFeature(SimpleLocation(ExactPosition(3), ExactPosition(4),
                                   strand=1),
                    type='misc', qualifiers=...),
         SeqFeature(SimpleLocation(ExactPosition(2), ExactPosition(4),
                                   strand=1),
                    type='misc', qualifiers=...)]
        >>> print(a.sorted_features())
        [SeqFeature(SimpleLocation(ExactPosition(2), ExactPosition(4),
                                   strand=1),
                    type='misc', qualifiers=...),
         SeqFeature(SimpleLocation(ExactPosition(3), ExactPosition(4),
                                   strand=1),
                    type='misc', qualifiers=...)]
        """
        return sorted(self.features, key=lambda x: x.location.start)

[docs]    def seguid(self):
        """Return the url safe SEGUID [#]_ for the sequence.

        This checksum is the same as seguid but with base64.urlsafe
        encoding instead of the normal base 64. This means that
        the characters + and / are replaced with - and _ so that
        the checksum can be a part of and URL or a filename.

        Examples
        --------
        >>> from pydna.seqrecord import SeqRecord
        >>> a=SeqRecord("gattaca")
        >>> a.seguid() # original seguid is +bKGnebMkia5kNg/gF7IORXMnIU
        'lsseguid=tp2jzeCM2e3W4yxtrrx09CMKa_8'

        References
        ----------
        .. [#] http://wiki.christophchamp.com/index.php/SEGUID
        """
        return self.seq.seguid()

[docs]    def comment(self, newcomment=""):
        """docstring."""
        result = self.annotations.get("comment", "")
        if newcomment:
            self.annotations["comment"] = (result + "\n" + newcomment).strip()
            result = _pretty_str(self.annotations["comment"])
        return result

[docs]    def datefunction():
        """docstring."""
        return datetime.datetime.now().replace(microsecond=0).isoformat()

[docs]    def stamp(self, now=datefunction, tool="pydna", separator=" ", comment=""):
        """Add seguid checksum to COMMENTS sections

        The checksum is stored in object.annotations["comment"].
        This shows in the COMMENTS section of a formatted genbank file.

        For blunt linear sequences:

        ``SEGUID <seguid>``

        For circular sequences:

        ``cSEGUID <seguid>``

        Fore linear sequences which are not blunt:

        ``lSEGUID <seguid>``


        Examples
        --------
        >>> from pydna.seqrecord import SeqRecord
        >>> a = SeqRecord("aa")
        >>> a.stamp()
        'lsseguid=gBw0Jp907Tg_yX3jNgS4qQWttjU'
        >>> a.annotations["comment"][:41]
        'pydna lsseguid=gBw0Jp907Tg_yX3jNgS4qQWttj'
        """
        chksum = self.seq.seguid()
        oldcomment = self.annotations.get("comment", "")
        oldstamp = _re.findall(r"..seguid=\S{27}", oldcomment)
        if oldstamp and oldstamp[0] == chksum:
            return _pretty_str(oldstamp[0])
        elif oldstamp:
            _warn(
                f"Stamp change.\nNew: {chksum}\nOld: {oldstamp[0]}",
                _PydnaWarning,
            )
        self.annotations["comment"] = (f"{oldcomment}\n" f"{tool} {chksum} {now()} {comment}").strip()
        return _pretty_str(chksum)

[docs]    def lcs(self, other, *args, limit=25, **kwargs):
        """Return the longest common substring between the sequence.

        and another sequence (other). The other sequence can be a string,
        Seq, SeqRecord, Dseq or DseqRecord.
        The method returns a SeqFeature with type "read" as this method
        is mostly used to map sequence reads to the sequence. This can be
        changed by passing a type as keyword with some other string value.

        Examples
        --------
        >>> from pydna.seqrecord import SeqRecord
        >>> a = SeqRecord("GGATCC")
        >>> a.lcs("GGATCC", limit=6)
        SeqFeature(SimpleLocation(ExactPosition(0),
                                  ExactPosition(6), strand=1),
                                  type='read',
                                  qualifiers=...)
        >>> a.lcs("GATC", limit=4)
        SeqFeature(SimpleLocation(ExactPosition(1),
                                  ExactPosition(5), strand=1),
                                  type='read',
                                  qualifiers=...)
        >>> a = SeqRecord("CCCCC")
        >>> a.lcs("GGATCC", limit=6)
        SeqFeature(None)

        """
        # longest_common_substring
        # https://biopython.org/wiki/ABI_traces
        if hasattr(other, "seq"):
            r = other.seq
            if hasattr(r, "watson"):
                r = str(r.watson).lower()
            else:
                r = str(r).lower()
        else:
            r = str(other.lower())

        olaps = _common_sub_strings(str(self.seq).lower(), r, limit=limit or 25)

        try:
            start_in_self, start_in_other, length = olaps.pop(0)
        except IndexError:
            result = _SeqFeature()
        else:
            label = "sequence" if not hasattr(other, "name") else other.name
            result = _SeqFeature(
                _SimpleLocation(start_in_self, start_in_self + length, strand=1),
                type=kwargs.get("type") or "read",
                qualifiers={
                    "label": [kwargs.get("label") or label],
                    "ApEinfo_fwdcolor": ["#DAFFCF"],
                    "ApEinfo_revcolor": ["#DFFDFF"],
                },
            )
        return result

[docs]    def gc(self):
        """Return GC content."""
        return self.seq.gc()

[docs]    def cai(self, organism="sce"):
        """docstring."""
        return self.seq.cai(organism=organism)

[docs]    def rarecodons(self, organism="sce"):
        """docstring."""
        sfs = []
        for slc in self.seq.rarecodons(organism):
            cdn = self.seq._data[slc].decode("ASCII")
            sfs.append(
                _SeqFeature(
                    _SimpleLocation(slc.start, slc.stop),
                    type=f"rare_codon_{organism}",
                    qualifiers={"label": [cdn]},
                )
            )
        return sfs

[docs]    def startcodon(self, organism="sce"):
        """docstring."""
        return self.seq.startcodon()

[docs]    def stopcodon(self, organism="sce"):
        """docstring."""
        return self.seq.stopcodon()

[docs]    def express(self, organism="sce"):
        """docstring."""
        return self.seq.express()

[docs]    def copy(self):
        """docstring."""
        return _copy(self)

    def __lt__(self, other):
        """docstring."""
        try:
            return str(self.seq) < str(other.seq)
        except AttributeError:
            # I don't know how to compare to other
            return NotImplemented

    def __gt__(self, other):
        """docstring."""
        try:
            return str(self.seq) > str(other.seq)
        except AttributeError:
            # I don't know how to compare to other
            return NotImplemented

    def __eq__(self, other):
        """docstring."""
        try:
            if self.seq == other.seq and str(self.__dict__) == str(other.__dict__):
                return True
        except AttributeError:
            pass
        return False

    def __ne__(self, other):
        """docstring."""
        return not self.__eq__(other)

    def __hash__(self):
        """__hash__ must be based on __eq__."""
        return hash((str(self.seq).lower(), str(tuple(sorted(self.__dict__.items())))))

    def __str__(self):
        """docstring."""
        return _pretty_str(super().__str__())

    def __repr__(self):
        """docstring."""
        return _pretty_str(super().__repr__())

    def __format__(self, format):
        """docstring."""

        def removeprefix(text, prefix):
            """Until Python 3.8 is dropped, then use str.removeprefix."""
            if text.startswith(prefix):
                return text[len(prefix) :]
            return text

        if format == "pydnafasta":
            return _pretty_str(
                f">{self.id} {len(self)} bp {dict(((True,'circular'),(False,'linear')))[self.seq.circular]}\n{str(self.seq)}\n"
            )
        if format == "primer":
            return _pretty_str(
                f">{self.id} {len(self)}-mer{removeprefix(self.description, self.name).strip()}\n{str(self.seq)}\n"
            )
        return _pretty_str(super().__format__(format))

    def __add__(self, other):
        """docstring."""
        answer = super().__add__(other)
        if answer.name == "<unknown name>":
            answer.name = "name"
        if answer.id == "<unknown id>":
            answer.id = "id"
        if answer.description == "<unknown description>":
            answer.description = "description"
        answer = type(self).from_Bio_SeqRecord(answer)
        return answer

    def __getitem__(self, index):
        """docstring."""
        from pydna.utils import (
            identifier_from_string as _identifier_from_string,
        )  # TODO: clean this up

        answer = super().__getitem__(index)
        if len(answer) < 2:
            return answer
        identifier = "part_{id}".format(id=self.id)
        if answer.features:
            sf = max(answer.features, key=len)  # default
            if "label" in sf.qualifiers:
                identifier = " ".join(sf.qualifiers["label"])
            elif "note" in sf.qualifiers:
                identifier = " ".join(sf.qualifiers["note"])
        answer.id = _identifier_from_string(identifier)[:16]
        answer.name = _identifier_from_string(f"part_{self.name}")[:16]
        return answer

    def __bool__(self):
        """Boolean value of an instance of this class (True).

        This behaviour is for backwards compatibility, since until the
        __len__ method was added, a SeqRecord always evaluated as True.

        Note that in comparison, a Seq object will evaluate to False if it
        has a zero length sequence.

        WARNING: The SeqRecord may in future evaluate to False when its
        sequence is of zero length (in order to better match the Seq
        object behaviour)!
        """
        return bool(self.seq)

[docs]    def dump(self, filename, protocol=None):
        """docstring."""
        from pathlib import Path

        pth = Path(filename)
        if not pth.suffix:
            pth = pth.with_suffix(".pickle")
        with open(pth, "wb") as f:
            _pickle.dump(self, f, protocol=protocol)
        return _pretty_str(pth)


[docs]class ProteinSeqRecord(SeqRecord):

[docs]    def reverse_complement(self, *args, **kwargs):
        raise NotImplementedError("Not defined for protein.")

    rc = reverse_complement

[docs]    def isorf(self, *args, **kwargs):
        raise NotImplementedError("Not defined for protein.")

[docs]    def gc(self):
        raise NotImplementedError("Not defined for protein.")

[docs]    def cai(self, *args, **kwargs):
        raise NotImplementedError("Not defined for protein.")

[docs]    def rarecodons(self, *args, **kwargs):
        raise NotImplementedError("Not defined for protein.")

[docs]    def startcodon(self, *args, **kwargs):
        raise NotImplementedError("Not defined for protein.")

[docs]    def stopcodon(self, *args, **kwargs):
        raise NotImplementedError("Not defined for protein.")

[docs]    def express(self, *args, **kwargs):
        raise NotImplementedError("Not defined for protein.")

    def __format__(self, format):
        """docstring."""
        return _pretty_str(_SeqRecord.__format__(self, format))


if __name__ == "__main__":
    import os as _os

    cached = _os.getenv("pydna_cached_funcs", "")
    _os.environ["pydna_cached_funcs"] = ""
    import doctest

    doctest.testmod(verbose=True, optionflags=(doctest.ELLIPSIS | doctest.NORMALIZE_WHITESPACE))
    _os.environ["pydna_cached_funcs"] = cached