#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright 2013-2023 by Björn Johansson. All rights reserved.
# This code is part of the Python-dna distribution and governed by its
# license. Please see the LICENSE.txt file that should have been included
# as part of this package.
"""This module is based on the Py-rstr-max package that
was written by Romain Brixtel (rbrixtel_at_gmail_dot_com)
(https://brixtel.users.greyc.fr) and is available from
https://code.google.com/p/py-rstr-max
https://github.com/gip0/py-rstr-max
the original code was covered by an MIT licence."""
# from array import array as _array
# import itertools as _itertools
from operator import itemgetter as _itemgetter
from typing import List as _List, Tuple as _Tuple
Match = _Tuple[int, int, int] # (x_start, y_start, length)
# def _kark_sort(s, SA, n, K):
# def radixpass(a, b, r, s, n, k):
# c = _array("i", [0] * (k + 1))
# for i in range(n):
# c[r[a[i] + s]] += 1
# somme = 0
# for i in range(k + 1):
# freq, c[i] = c[i], somme
# somme += freq
# for i in range(n):
# b[c[r[a[i] + s]]] = a[i]
# c[r[a[i] + s]] += 1
# n0 = (n + 2) // 3
# n1 = (n + 1) // 3
# n2 = n // 3
# n02 = n0 + n2
# SA12 = _array("i", [0] * (n02 + 3))
# SA0 = _array("i", [0] * n0)
# s12 = [i for i in range(n + (n0 - n1)) if i % 3]
# s12.extend([0] * 3)
# s12 = _array("i", s12)
# radixpass(s12, SA12, s, 2, n02, K)
# radixpass(SA12, s12, s, 1, n02, K)
# radixpass(s12, SA12, s, 0, n02, K)
# name = 0
# c0, c1, c2 = -1, -1, -1
# for i in range(n02):
# if s[SA12[i]] != c0 or s[SA12[i] + 1] != c1 or s[SA12[i] + 2] != c2:
# name += 1
# c0 = s[SA12[i]]
# c1 = s[SA12[i] + 1]
# c2 = s[SA12[i] + 2]
# if SA12[i] % 3 == 1:
# s12[SA12[i] // 3] = name
# else:
# s12[SA12[i] // 3 + n0] = name
# if name < n02:
# _kark_sort(s12, SA12, n02, name + 1)
# for i in range(n02):
# s12[SA12[i]] = i + 1
# else:
# for i in range(n02):
# SA12[s12[i] - 1] = i
# s0 = _array("i", [SA12[i] * 3 for i in range(n02) if SA12[i] < n0])
# radixpass(s0, SA0, s, 0, n0, K)
# p = j = k = 0
# t = n0 - n1
# while k < n:
# i = SA12[t] * 3 + 1 if SA12[t] < n0 else (SA12[t] - n0) * 3 + 2
# j = SA0[p] if p < n0 else 0
# if SA12[t] < n0:
# test = (s12[SA12[t] + n0] <= s12[j // 3]) if (s[i] == s[j]) else (s[i] < s[j])
# elif s[i] == s[j]:
# test = s12[SA12[t] - n0 + 1] <= s12[j // 3 + n0] if (s[i + 1] == s[j + 1]) else s[i + 1] < s[j + 1]
# else:
# test = s[i] < s[j]
# if test:
# SA[k] = i
# t += 1
# if t == n02:
# k += 1
# while p < n0:
# SA[k] = SA0[p]
# p += 1
# k += 1
# else:
# SA[k] = j
# p += 1
# if p == n0:
# k += 1
# while t < n02:
# SA[k] = (SA12[t] * 3) + 1 if SA12[t] < n0 else ((SA12[t] - n0) * 3) + 2
# t += 1
# k += 1
# k += 1
# class _Rstr_max:
# def __init__(self):
# self.array_str = []
# def add_str(self, str_unicode):
# self.array_str.append(str_unicode)
# def _step1_sort_suffix(self):
# char_frontier = chr(2)
# self.global_suffix = char_frontier.join(self.array_str)
# nbChars = len(self.global_suffix)
# init = [-1] * nbChars
# self.idxString = _array("i", init)
# self.idxPos = _array("i", init)
# self.endAt = _array("i", init)
# k = idx = 0
# for mot in self.array_str:
# last = k + len(mot)
# for p in range(len(mot)):
# self.idxString[k] = idx
# self.idxPos[k] = p
# self.endAt[k] = last
# k += 1
# idx += 1
# k += 1
# s = self.global_suffix
# alphabet = [None] + sorted(set(s))
# k = len(alphabet)
# n = len(s)
# t = dict((c, i) for i, c in enumerate(alphabet))
# SA = _array("i", [0] * (n + 3))
# _kark_sort(_array("i", [t[c] for c in s] + [0] * 3), SA, n, k)
# self.res = SA[:n]
# def _step2_lcp(self):
# n = len(self.res)
# init = [0] * n
# rank = _array("i", init)
# LCP = _array("i", init)
# s = self.global_suffix
# suffix__array = self.res
# endAt = self.endAt
# for i in range(len(self.array_str), n):
# v = self.res[i]
# rank[v] = i
# l = 0
# for j in range(n):
# if l > 0:
# l -= 1
# i = rank[j]
# j2 = suffix__array[i - 1]
# if i:
# while l + j < endAt[j] and l + j2 < endAt[j2] and s[j + l] == s[j2 + l]:
# l += 1
# LCP[i - 1] = l
# else:
# l = 0
# self.lcp = LCP
# def _step3_rstr(self):
# prev_len = 0
# idx = 0
# results = {}
# len_lcp = len(self.lcp) - 1
# class Stack:
# pass
# stack = Stack()
# stack._top = 0
# stack.lst_max = []
# # if len(self.res) == 0 :
# # return {}
# pos1 = self.res[0]
# for idx in range(len_lcp):
# current_len = self.lcp[idx]
# pos2 = self.res[idx + 1]
# end_ = max(pos1, pos2) + current_len
# n = prev_len - current_len
# if n < 0:
# # pushMany
# stack.lst_max.append([-n, idx, end_])
# stack._top += -n
# elif n > 0:
# self.removeMany(stack, results, n, idx)
# elif stack._top > 0 and end_ > stack.lst_max[-1][-1]:
# # setMax
# stack.lst_max[-1][-1] = end_
# prev_len = current_len
# pos1 = pos2
# if stack._top > 0:
# self.removeMany(stack, results, stack._top, idx + 1)
# return results
# def removeMany(self, stack, results, m, idxEnd):
# prevStart = -1
# while m > 0:
# n, idxStart, maxEnd = stack.lst_max.pop()
# if prevStart != idxStart:
# # idStr = self.idxString[maxEnd-1]
# # pos = self.idxPos[maxEnd-1]
# id_ = (maxEnd, idxEnd - idxStart + 1)
# if id_ not in results or results[id_][0] < stack._top:
# results[id_] = (stack._top, idxStart)
# prevStart = idxStart
# m -= n
# stack._top -= n
# if m < 0:
# stack.lst_max.append([-m, idxStart, maxEnd - n - m])
# stack._top -= m
# def go(self):
# self._step1_sort_suffix()
# self._step2_lcp()
# return self._step3_rstr()
# def common_sub_strings_py(stringx: str, stringy: str, limit=25):
# """Finds all common substrings between stringx and stringy
# longer than limit. This function is case sensitive.
# The substrings may overlap.
# returns a list of tuples describing the substrings
# The list is sorted longest -> shortest.
# Parameters
# ----------
# stringx : str
# stringy : str
# limit : int, optional
# Returns
# -------
# list of tuple
# [(startx1, starty1, length1),(startx2, starty2, length2), ...]
# startx1 = startposition in x, where substring 1 starts
# starty1 = position in y where substring 1 starts
# length1 = lenght of substring
# Examples
# --------
# >>> from pydna.common_sub_strings import common_sub_strings
# >>> common_sub_strings("gatgatttcggtagtta", "gtcagtatgtctatctatcgcg", limit=3)
# [(1, 6, 3), (7, 17, 3), (10, 4, 3), (12, 3, 3)]
# ::
# Overlaps Symbols
# (1, 6, 3) ---
# (7, 17, 3) +++
# (10, 4, 3) ...
# (12, 3, 3) ===
# ===
# gatgatttcggtagtta stringx
# --- +++...
# ...
# gtcagtatgtctatctatcgcg stringy
# ===--- +++
# """
# rstr = _Rstr_max()
# rstr.add_str("&".join((stringx, stringy)))
# r = rstr.go()
# match = {}
# for (offset_end, nb), (l, start_plage) in r.items():
# if l < limit:
# continue
# startsx = []
# startsy = []
# for o in range(start_plage, start_plage + nb):
# offset = rstr.idxPos[rstr.res[o]]
# if offset > len(stringx):
# startsy.append(offset - len(stringx) - 1)
# else:
# startsx.append(offset)
# for a, b in _itertools.product(startsx, startsy):
# match[(a, b)] = max(match.get((a, b)) or 0, l)
# match = [(key[0], key[1], val) for key, val in list(match.items())]
# match.sort()
# match.sort(key=_itemgetter(2), reverse=True)
# return match
[docs]def common_sub_strings(stringx: str, stringy: str, limit: int = 25) -> _List[Match]:
"""
Finds all common substrings between stringx and stringy, and returns
them sorted by length.
This function is case sensitive.
Parameters
----------
stringx : str
stringy : str
limit : int, optional
Returns
-------
list of tuple
[(startx1, starty1, length1),(startx2, starty2, length2), ...]
startx1 = startposition in x, where substring 1 starts
starty1 = position in y where substring 1 starts
length1 = lenght of substring
"""
from pydivsufsort import common_substrings
matches = common_substrings(stringx, stringy, limit=limit)
matches.sort()
matches.sort(key=_itemgetter(2), reverse=True)
return matches
[docs]def terminal_overlap(stringx: str, stringy: str, limit: int = 15) -> _List[Match]:
"""Finds the the flanking common substrings between stringx and stringy
longer than limit. This means that the results only contains substrings
that starts or ends at the the ends of stringx and stringy.
This function is case sensitive.
returns a list of tuples describing the substrings
The list is sorted longest -> shortest.
Parameters
----------
stringx : str
stringy : str
limit : int, optional
Returns
-------
list of tuple
[(startx1,starty1,length1),(startx2,starty2,length2), ...]
startx1 = startposition in x, where substring 1 starts
starty1 = position in y where substring 1 starts
length1 = lenght of substring
Examples
--------
>>> from pydna.common_sub_strings import terminal_overlap
>>> terminal_overlap("agctatgtatcttgcatcgta", "gcatcgtagtctatttgcttac", limit=8)
[(13, 0, 8)]
::
<-- 8 ->
<---- 13 --->
agctatgtatcttgcatcgta stringx
gcatcgtagtctatttgcttac stringy
0
"""
return [
m
for m in common_sub_strings(stringx, stringy, limit)
if (m[0] == 0 and m[1] + m[2] == len(stringy)) or (m[1] == 0 and m[0] + m[2] == len(stringx))
]
if __name__ == "__main__":
import os as _os
cached = _os.getenv("pydna_cached_funcs", "")
_os.environ["pydna_cached_funcs"] = ""
import doctest
doctest.testmod(verbose=True, optionflags=doctest.ELLIPSIS)
_os.environ["pydna_cached_funcs"] = cached