Source code for pydna.common_sub_strings

#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2013-2023 by Björn Johansson.  All rights reserved.
# This code is part of the Python-dna distribution and governed by its
# license.  Please see the LICENSE.txt file that should have been included
# as part of this package.

"""This module is based on the Py-rstr-max package that
was written by Romain Brixtel (rbrixtel_at_gmail_dot_com)
(https://brixtel.users.greyc.fr) and is available from
https://code.google.com/p/py-rstr-max
https://github.com/gip0/py-rstr-max
the original code was covered by an MIT licence."""


# from array import array as _array
# import itertools as _itertools
from operator import itemgetter as _itemgetter
from typing import List as _List, Tuple as _Tuple

Match = _Tuple[int, int, int]  # (x_start, y_start, length)

# def _kark_sort(s, SA, n, K):
#     def radixpass(a, b, r, s, n, k):
#         c = _array("i", [0] * (k + 1))
#         for i in range(n):
#             c[r[a[i] + s]] += 1

#         somme = 0
#         for i in range(k + 1):
#             freq, c[i] = c[i], somme
#             somme += freq

#         for i in range(n):
#             b[c[r[a[i] + s]]] = a[i]
#             c[r[a[i] + s]] += 1

#     n0 = (n + 2) // 3
#     n1 = (n + 1) // 3
#     n2 = n // 3
#     n02 = n0 + n2

#     SA12 = _array("i", [0] * (n02 + 3))
#     SA0 = _array("i", [0] * n0)

#     s12 = [i for i in range(n + (n0 - n1)) if i % 3]
#     s12.extend([0] * 3)
#     s12 = _array("i", s12)

#     radixpass(s12, SA12, s, 2, n02, K)
#     radixpass(SA12, s12, s, 1, n02, K)
#     radixpass(s12, SA12, s, 0, n02, K)

#     name = 0
#     c0, c1, c2 = -1, -1, -1
#     for i in range(n02):
#         if s[SA12[i]] != c0 or s[SA12[i] + 1] != c1 or s[SA12[i] + 2] != c2:
#             name += 1
#             c0 = s[SA12[i]]
#             c1 = s[SA12[i] + 1]
#             c2 = s[SA12[i] + 2]
#         if SA12[i] % 3 == 1:
#             s12[SA12[i] // 3] = name
#         else:
#             s12[SA12[i] // 3 + n0] = name

#     if name < n02:
#         _kark_sort(s12, SA12, n02, name + 1)
#         for i in range(n02):
#             s12[SA12[i]] = i + 1
#     else:
#         for i in range(n02):
#             SA12[s12[i] - 1] = i

#     s0 = _array("i", [SA12[i] * 3 for i in range(n02) if SA12[i] < n0])
#     radixpass(s0, SA0, s, 0, n0, K)

#     p = j = k = 0
#     t = n0 - n1
#     while k < n:
#         i = SA12[t] * 3 + 1 if SA12[t] < n0 else (SA12[t] - n0) * 3 + 2
#         j = SA0[p] if p < n0 else 0

#         if SA12[t] < n0:
#             test = (s12[SA12[t] + n0] <= s12[j // 3]) if (s[i] == s[j]) else (s[i] < s[j])
#         elif s[i] == s[j]:
#             test = s12[SA12[t] - n0 + 1] <= s12[j // 3 + n0] if (s[i + 1] == s[j + 1]) else s[i + 1] < s[j + 1]
#         else:
#             test = s[i] < s[j]
#         if test:
#             SA[k] = i
#             t += 1
#             if t == n02:
#                 k += 1
#                 while p < n0:
#                     SA[k] = SA0[p]
#                     p += 1
#                     k += 1
#         else:
#             SA[k] = j
#             p += 1
#             if p == n0:
#                 k += 1
#                 while t < n02:
#                     SA[k] = (SA12[t] * 3) + 1 if SA12[t] < n0 else ((SA12[t] - n0) * 3) + 2
#                     t += 1
#                     k += 1
#         k += 1


# class _Rstr_max:
#     def __init__(self):
#         self.array_str = []

#     def add_str(self, str_unicode):
#         self.array_str.append(str_unicode)

#     def _step1_sort_suffix(self):
#         char_frontier = chr(2)

#         self.global_suffix = char_frontier.join(self.array_str)

#         nbChars = len(self.global_suffix)
#         init = [-1] * nbChars
#         self.idxString = _array("i", init)
#         self.idxPos = _array("i", init)
#         self.endAt = _array("i", init)

#         k = idx = 0
#         for mot in self.array_str:
#             last = k + len(mot)
#             for p in range(len(mot)):
#                 self.idxString[k] = idx
#                 self.idxPos[k] = p
#                 self.endAt[k] = last
#                 k += 1
#             idx += 1
#             k += 1

#         s = self.global_suffix
#         alphabet = [None] + sorted(set(s))
#         k = len(alphabet)
#         n = len(s)
#         t = dict((c, i) for i, c in enumerate(alphabet))
#         SA = _array("i", [0] * (n + 3))
#         _kark_sort(_array("i", [t[c] for c in s] + [0] * 3), SA, n, k)
#         self.res = SA[:n]

#     def _step2_lcp(self):
#         n = len(self.res)
#         init = [0] * n
#         rank = _array("i", init)
#         LCP = _array("i", init)

#         s = self.global_suffix
#         suffix__array = self.res
#         endAt = self.endAt

#         for i in range(len(self.array_str), n):
#             v = self.res[i]
#             rank[v] = i

#         l = 0
#         for j in range(n):
#             if l > 0:
#                 l -= 1
#             i = rank[j]
#             j2 = suffix__array[i - 1]
#             if i:
#                 while l + j < endAt[j] and l + j2 < endAt[j2] and s[j + l] == s[j2 + l]:
#                     l += 1
#                 LCP[i - 1] = l
#             else:
#                 l = 0
#         self.lcp = LCP

#     def _step3_rstr(self):
#         prev_len = 0
#         idx = 0
#         results = {}
#         len_lcp = len(self.lcp) - 1

#         class Stack:
#             pass

#         stack = Stack()
#         stack._top = 0
#         stack.lst_max = []

#         # if len(self.res) == 0 :
#         #  return {}

#         pos1 = self.res[0]
#         for idx in range(len_lcp):
#             current_len = self.lcp[idx]
#             pos2 = self.res[idx + 1]
#             end_ = max(pos1, pos2) + current_len
#             n = prev_len - current_len
#             if n < 0:
#                 # pushMany
#                 stack.lst_max.append([-n, idx, end_])
#                 stack._top += -n
#             elif n > 0:
#                 self.removeMany(stack, results, n, idx)
#             elif stack._top > 0 and end_ > stack.lst_max[-1][-1]:
#                 # setMax
#                 stack.lst_max[-1][-1] = end_

#             prev_len = current_len
#             pos1 = pos2

#         if stack._top > 0:
#             self.removeMany(stack, results, stack._top, idx + 1)

#         return results

#     def removeMany(self, stack, results, m, idxEnd):
#         prevStart = -1
#         while m > 0:
#             n, idxStart, maxEnd = stack.lst_max.pop()
#             if prevStart != idxStart:
#                 # idStr = self.idxString[maxEnd-1]
#                 # pos = self.idxPos[maxEnd-1]
#                 id_ = (maxEnd, idxEnd - idxStart + 1)
#                 if id_ not in results or results[id_][0] < stack._top:
#                     results[id_] = (stack._top, idxStart)
#                 prevStart = idxStart
#             m -= n
#             stack._top -= n
#         if m < 0:
#             stack.lst_max.append([-m, idxStart, maxEnd - n - m])
#             stack._top -= m

#     def go(self):
#         self._step1_sort_suffix()
#         self._step2_lcp()
#         return self._step3_rstr()


# def common_sub_strings_py(stringx: str, stringy: str, limit=25):
#     """Finds all common substrings between stringx and stringy
#     longer than limit. This function is case sensitive.
#     The substrings may overlap.

#     returns a list of tuples describing the substrings
#     The list is sorted longest -> shortest.

#     Parameters
#     ----------
#     stringx : str
#     stringy : str
#     limit : int, optional

#     Returns
#     -------
#     list of tuple
#         [(startx1, starty1, length1),(startx2, starty2, length2), ...]

#         startx1 = startposition in x, where substring 1 starts
#         starty1 = position in y where substring 1 starts
#         length1 = lenght of substring


#     Examples
#     --------

#     >>> from pydna.common_sub_strings import common_sub_strings
#     >>> common_sub_strings("gatgatttcggtagtta", "gtcagtatgtctatctatcgcg", limit=3)
#     [(1, 6, 3), (7, 17, 3), (10, 4, 3), (12, 3, 3)]

#     ::

#         Overlaps   Symbols
#         (1, 6,  3)   ---
#         (7, 17, 3)   +++
#         (10, 4, 3)   ...
#         (12, 3, 3)   ===


#                     ===
#         gatgatttcggtagtta           stringx
#          ---   +++...

#             ...
#         gtcagtatgtctatctatcgcg      stringy
#            ===---        +++

#     """
#     rstr = _Rstr_max()
#     rstr.add_str("&".join((stringx, stringy)))
#     r = rstr.go()
#     match = {}
#     for (offset_end, nb), (l, start_plage) in r.items():
#         if l < limit:
#             continue
#         startsx = []
#         startsy = []
#         for o in range(start_plage, start_plage + nb):
#             offset = rstr.idxPos[rstr.res[o]]
#             if offset > len(stringx):
#                 startsy.append(offset - len(stringx) - 1)
#             else:
#                 startsx.append(offset)

#         for a, b in _itertools.product(startsx, startsy):
#             match[(a, b)] = max(match.get((a, b)) or 0, l)

#     match = [(key[0], key[1], val) for key, val in list(match.items())]

#     match.sort()

#     match.sort(key=_itemgetter(2), reverse=True)

#     return match


[docs]def common_sub_strings(stringx: str, stringy: str, limit: int = 25) -> _List[Match]: """ Finds all common substrings between stringx and stringy, and returns them sorted by length. This function is case sensitive. Parameters ---------- stringx : str stringy : str limit : int, optional Returns ------- list of tuple [(startx1, starty1, length1),(startx2, starty2, length2), ...] startx1 = startposition in x, where substring 1 starts starty1 = position in y where substring 1 starts length1 = lenght of substring """ from pydivsufsort import common_substrings matches = common_substrings(stringx, stringy, limit=limit) matches.sort() matches.sort(key=_itemgetter(2), reverse=True) return matches
[docs]def terminal_overlap(stringx: str, stringy: str, limit: int = 15) -> _List[Match]: """Finds the the flanking common substrings between stringx and stringy longer than limit. This means that the results only contains substrings that starts or ends at the the ends of stringx and stringy. This function is case sensitive. returns a list of tuples describing the substrings The list is sorted longest -> shortest. Parameters ---------- stringx : str stringy : str limit : int, optional Returns ------- list of tuple [(startx1,starty1,length1),(startx2,starty2,length2), ...] startx1 = startposition in x, where substring 1 starts starty1 = position in y where substring 1 starts length1 = lenght of substring Examples -------- >>> from pydna.common_sub_strings import terminal_overlap >>> terminal_overlap("agctatgtatcttgcatcgta", "gcatcgtagtctatttgcttac", limit=8) [(13, 0, 8)] :: <-- 8 -> <---- 13 ---> agctatgtatcttgcatcgta stringx gcatcgtagtctatttgcttac stringy 0 """ return [ m for m in common_sub_strings(stringx, stringy, limit) if (m[0] == 0 and m[1] + m[2] == len(stringy)) or (m[1] == 0 and m[0] + m[2] == len(stringx)) ]
if __name__ == "__main__": import os as _os cached = _os.getenv("pydna_cached_funcs", "") _os.environ["pydna_cached_funcs"] = "" import doctest doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS) _os.environ["pydna_cached_funcs"] = cached