2016-12-28 23:54:51 +01:00
|
|
|
\chapter{String algorithms}
|
|
|
|
|
2017-01-22 12:15:41 +01:00
|
|
|
\index{string}
|
|
|
|
\index{alphabet}
|
|
|
|
|
|
|
|
A string $s$ of length $n$
|
|
|
|
is a sequence of characters
|
|
|
|
$s[1],s[2],\ldots,s[n]$.
|
|
|
|
|
|
|
|
An \key{alphabet} is a set of characters
|
|
|
|
that may appear in strings.
|
|
|
|
For example, the alphabet
|
|
|
|
$\{\texttt{A},\texttt{B},\ldots,\texttt{Z}\}$
|
|
|
|
consists of the capital letters of English.
|
|
|
|
|
|
|
|
\index{substring}
|
|
|
|
|
|
|
|
A \key{substring} consists of consecutive
|
|
|
|
characters in a string.
|
|
|
|
The number of substrings in a string is $n(n+1)/2$.
|
|
|
|
For example, \texttt{ORITH} is a substring
|
|
|
|
in \texttt{ALGORITHM}, and it corresponds
|
|
|
|
to \texttt{ALG\underline{ORITH}M}.
|
|
|
|
|
|
|
|
\index{subsequence}
|
|
|
|
|
|
|
|
A \key{subsequence} is a subset of characters
|
|
|
|
in a string in their original order.
|
|
|
|
The number of subsequences in a string is $2^n-1$.
|
|
|
|
For example, \texttt{LGRHM} is a subsequence
|
|
|
|
in \texttt{ALGORITHM}, and it corresponds
|
|
|
|
to \texttt{A\underline{LG}O\underline{R}IT\underline{HM}}.
|
|
|
|
|
|
|
|
\index{prefix}
|
|
|
|
\index{suffix}
|
|
|
|
|
|
|
|
A \key{prefix} is a substring that contains the first
|
|
|
|
character of a string,
|
|
|
|
and a \key{suffix} is a substring that contains the last character.
|
|
|
|
For example, the prefixes of
|
|
|
|
\texttt{STORY} are \texttt{S}, \texttt{ST},
|
|
|
|
\texttt{STO}, \texttt{STOR} and \texttt{STORY},
|
|
|
|
and the suffixes are \texttt{Y}, \texttt{RY},
|
|
|
|
\texttt{ORY}, \texttt{TORY} and \texttt{STORY}.
|
|
|
|
A prefix or a suffix is \key{proper}
|
|
|
|
if it is not the whole string.
|
|
|
|
|
|
|
|
\index{rotation}
|
|
|
|
|
|
|
|
A \key{rotation} can be generated by moving
|
|
|
|
characters one by one from the beginning to the end
|
|
|
|
in a string (or vice versa).
|
|
|
|
For example, the rotations of \texttt{STORY} are
|
|
|
|
\texttt{STORY},
|
|
|
|
\texttt{TORYS},
|
|
|
|
\texttt{ORYST},
|
|
|
|
\texttt{RYSTO} and
|
|
|
|
\texttt{YSTOR}.
|
|
|
|
|
|
|
|
\index{period}
|
|
|
|
|
|
|
|
A \key{period} is a prefix of a string such that
|
|
|
|
we can construct the string by repeating the period.
|
|
|
|
The last repetition may be partial and contain
|
|
|
|
only a prefix of the period.
|
|
|
|
Often it is interesting to find the \key{shortest period}
|
|
|
|
of a string.
|
|
|
|
For example, the shortest period of
|
|
|
|
\texttt{ABCABCA} is \texttt{ABC}.
|
|
|
|
In this case, we first repeat the period twice
|
|
|
|
and then partially.
|
|
|
|
|
|
|
|
\index{border}
|
|
|
|
|
|
|
|
A \key{border} is a string that is both
|
|
|
|
a prefix and a suffix of a string.
|
|
|
|
For example, the borders for \texttt{ABADABA}
|
|
|
|
are \texttt{A}, \texttt{ABA} and \texttt{ABADABA}.
|
|
|
|
Often we want to find the \key{longest border}
|
|
|
|
that is not the whole string.
|
|
|
|
|
|
|
|
\index{lexicographical order}
|
|
|
|
|
|
|
|
Usually we compare strings using the \key{lexicographical order}
|
|
|
|
that corresponds to the alphabetical order.
|
|
|
|
It means that $x<y$ if either $x$ is a proper prefix of $y$,
|
|
|
|
or there is an index $k$ such that
|
|
|
|
$x[i]=y[i]$ when $i<k$ and $x[k]<y[k]$.
|
|
|
|
|
|
|
|
\section{Trie structure}
|
|
|
|
|
|
|
|
\index{trie}
|
|
|
|
|
|
|
|
A \key{trie} is a tree structure that
|
|
|
|
maintains a set of strings.
|
|
|
|
Strings are stored in a trie as chains
|
|
|
|
of characters that start at the root
|
|
|
|
of the tree.
|
|
|
|
If two strings have a common prefix,
|
|
|
|
they also share a chain in the tree.
|
|
|
|
|
|
|
|
For example, the following trie corresponds
|
|
|
|
to the set
|
|
|
|
$\{\texttt{CANAL},\texttt{CANDY},\texttt{THE},\texttt{THERE}\}$:
|
2016-12-28 23:54:51 +01:00
|
|
|
|
|
|
|
\begin{center}
|
|
|
|
\begin{tikzpicture}[scale=0.9]
|
|
|
|
\node[draw, circle] (1) at (0,20) {$\phantom{1}$};
|
|
|
|
\node[draw, circle] (2) at (-1.5,19) {$\phantom{1}$};
|
|
|
|
\node[draw, circle] (3) at (1.5,19) {$\phantom{1}$};
|
|
|
|
\node[draw, circle] (4) at (-1.5,17.5) {$\phantom{1}$};
|
|
|
|
\node[draw, circle] (5) at (-1.5,16) {$\phantom{1}$};
|
|
|
|
\node[draw, circle] (6) at (-2.5,14.5) {$\phantom{1}$};
|
|
|
|
\node[draw, circle] (7) at (-0.5,14.5) {$\phantom{1}$};
|
|
|
|
\node[draw, circle] (8) at (-2.5,13) {*};
|
|
|
|
\node[draw, circle] (9) at (-0.5,13) {*};
|
|
|
|
\node[draw, circle] (10) at (1.5,17.5) {$\phantom{1}$};
|
|
|
|
\node[draw, circle] (11) at (1.5,16) {*};
|
|
|
|
\node[draw, circle] (12) at (1.5,14.5) {$\phantom{1}$};
|
|
|
|
\node[draw, circle] (13) at (1.5,13) {*};
|
|
|
|
|
2017-01-22 12:15:41 +01:00
|
|
|
\path[draw,thick,->] (1) -- node[font=\small,label=\texttt{C}] {} (2);
|
|
|
|
\path[draw,thick,->] (1) -- node[font=\small,label=\texttt{T}] {} (3);
|
|
|
|
\path[draw,thick,->] (2) -- node[font=\small,label=left:\texttt{A}] {} (4);
|
|
|
|
\path[draw,thick,->] (4) -- node[font=\small,label=left:\texttt{N}] {} (5);
|
|
|
|
\path[draw,thick,->] (5) -- node[font=\small,label=left:\texttt{A}] {} (6);
|
|
|
|
\path[draw,thick,->] (5) -- node[font=\small,label=right:\texttt{D}] {} (7);
|
|
|
|
\path[draw,thick,->] (6) -- node[font=\small,label=left:\texttt{L}] {}(8);
|
|
|
|
\path[draw,thick,->] (7) -- node[font=\small,label=right:\texttt{Y}] {} (9);
|
|
|
|
\path[draw,thick,->] (3) -- node[font=\small,label=right:\texttt{H}] {} (10);
|
|
|
|
\path[draw,thick,->] (10) -- node[font=\small,label=right:\texttt{E}] {} (11);
|
2016-12-28 23:54:51 +01:00
|
|
|
\path[draw,thick,->] (11) -- node[font=\small,label=right:\texttt{R}] {} (12);
|
2017-01-22 12:15:41 +01:00
|
|
|
\path[draw,thick,->] (12) -- node[font=\small,label=right:\texttt{E}] {} (13);
|
2016-12-28 23:54:51 +01:00
|
|
|
\end{tikzpicture}
|
|
|
|
\end{center}
|
2017-01-22 12:15:41 +01:00
|
|
|
The character * in a node means that
|
|
|
|
a string ends at the node.
|
|
|
|
This character is needed because a string
|
|
|
|
may be a prefix of another string.
|
|
|
|
For example, in this trie, \texttt{THE}
|
|
|
|
is a prefix of \texttt{THERE}.
|
|
|
|
|
|
|
|
Inserting and searching a string in a trie take $O(n)$ time
|
|
|
|
where $n$ is the length of the string.
|
|
|
|
Both operations can be implemented by
|
|
|
|
starting at the root node and following the
|
|
|
|
chain of characters that appear in the string.
|
|
|
|
If needed, new nodes will be added to the trie.
|
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
Tries can be used for searching both strings
|
2017-01-22 12:15:41 +01:00
|
|
|
and prefixes of strings.
|
2017-01-24 20:59:20 +01:00
|
|
|
In addition, it is possible to calculate the number
|
|
|
|
of strings that correspond to each prefix,
|
|
|
|
which can be useful in some applications.
|
2017-01-22 12:15:41 +01:00
|
|
|
|
|
|
|
A trie can be stored as an array
|
2016-12-28 23:54:51 +01:00
|
|
|
\begin{lstlisting}
|
|
|
|
int t[N][A];
|
|
|
|
\end{lstlisting}
|
2017-01-22 12:15:41 +01:00
|
|
|
where $N$ is the maximum number of nodes
|
|
|
|
(the total length of the strings to be stored)
|
|
|
|
and $A$ is the size of the alphabet.
|
|
|
|
The nodes of a trie are numbered
|
|
|
|
$1,2,3,\ldots$ so that the number of the root is 1,
|
|
|
|
and $\texttt{t}[s][c]$ is the next node in the chain
|
|
|
|
from node $s$ using character $c$.
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
\section{String hashing}
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
\index{hashing}
|
|
|
|
\index{string hashing}
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
\key{String hashing} is a technique that
|
|
|
|
allows us to efficiently check whether two
|
|
|
|
substrings in a string are equal.
|
|
|
|
The idea is to compare hash values of the
|
|
|
|
substrings instead of their individual characters.
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
\subsubsection*{Calculating hash values}
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
\index{hash value}
|
|
|
|
\index{polynomial hashing}
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
A \key{hash value} of a string is
|
|
|
|
a number that is calculated from the characters
|
|
|
|
of the string.
|
|
|
|
If two strings are the same,
|
|
|
|
their hash values are also the same,
|
|
|
|
which makes it possible to compare strings
|
|
|
|
based on their hash values.
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
A usual way to implement string hashing
|
|
|
|
is to use polynomial hashing, which means
|
|
|
|
that the hash value is calculated using the formula
|
2016-12-28 23:54:51 +01:00
|
|
|
\[(c[1] A^{n-1} + c[2] A^{n-2} + \cdots + c[n] A^0) \bmod B ,\]
|
2017-01-24 20:59:20 +01:00
|
|
|
where $c[1],c[2],\ldots,c[n]$
|
|
|
|
are the codes of the characters in the string,
|
|
|
|
and $A$ and $B$ are pre-chosen constants.
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
For example, the codes of the characters
|
|
|
|
in the string \texttt{ALLEY} are:
|
2016-12-28 23:54:51 +01:00
|
|
|
\begin{center}
|
|
|
|
\begin{tikzpicture}[scale=0.7]
|
|
|
|
\draw (0,0) grid (5,2);
|
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
\node at (0.5, 1.5) {\texttt{A}};
|
|
|
|
\node at (1.5, 1.5) {\texttt{L}};
|
|
|
|
\node at (2.5, 1.5) {\texttt{L}};
|
|
|
|
\node at (3.5, 1.5) {\texttt{E}};
|
|
|
|
\node at (4.5, 1.5) {\texttt{Y}};
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
\node at (0.5, 0.5) {65};
|
|
|
|
\node at (1.5, 0.5) {76};
|
|
|
|
\node at (2.5, 0.5) {76};
|
|
|
|
\node at (3.5, 0.5) {69};
|
|
|
|
\node at (4.5, 0.5) {89};
|
2016-12-28 23:54:51 +01:00
|
|
|
|
|
|
|
\end{tikzpicture}
|
|
|
|
\end{center}
|
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
If $A=3$ and $B=97$, the hash value
|
|
|
|
for the string \texttt{ALLEY} is
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
\[(65 \cdot 3^4 + 76 \cdot 3^3 + 76 \cdot 3^2 + 69 \cdot 3^1 + 89 \cdot 3^0) \bmod 97 = 52.\]
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
\subsubsection*{Preprocessing}
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
To efficiently calculate hash values of substrings,
|
|
|
|
we need to preprocess the string.
|
|
|
|
It turns out that using polynomial hashing,
|
|
|
|
we can calculate the hash value of any substring
|
|
|
|
in $O(1)$ time after an $O(n)$ time preprocessing.
|
2016-12-28 23:54:51 +01:00
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
The idea is to construct an array $h$ such that
|
|
|
|
$h[k]$ contains the hash value for the prefix
|
|
|
|
of the string that ends at index $k$.
|
|
|
|
The array values can be recursively calculated as follows:
|
2016-12-28 23:54:51 +01:00
|
|
|
\[
|
|
|
|
\begin{array}{lcl}
|
|
|
|
h[0] & = & 0 \\
|
|
|
|
h[k] & = & (h[k-1] A + c[k]) \bmod B \\
|
|
|
|
\end{array}
|
|
|
|
\]
|
2017-01-24 20:59:20 +01:00
|
|
|
In addition, we construct an array $p$
|
|
|
|
where $p[k]=A^k \bmod B$:
|
2016-12-28 23:54:51 +01:00
|
|
|
\[
|
|
|
|
\begin{array}{lcl}
|
|
|
|
p[0] & = & 1 \\
|
|
|
|
p[k] & = & (p[k-1] A) \bmod B. \\
|
|
|
|
\end{array}
|
|
|
|
\]
|
2017-01-24 20:59:20 +01:00
|
|
|
Constructing these arrays takes $O(n)$ time.
|
|
|
|
After this, the hash value for a substring
|
|
|
|
of the string
|
|
|
|
that begins at index $a$ and ends at index $b$
|
|
|
|
can be calculated in $O(1)$ time using the formula
|
2016-12-28 23:54:51 +01:00
|
|
|
\[(h[b]-h[a-1] p[b-a+1]) \bmod B.\]
|
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
\subsubsection*{Using hash values}
|
|
|
|
|
|
|
|
We can efficiently compare strings using hash values.
|
|
|
|
Instead of comparing the real contents of the strings,
|
|
|
|
the idea is to compare their hash values.
|
|
|
|
If the hash values are equal,
|
|
|
|
the strings are \emph{probably} equal,
|
|
|
|
and if the hash values are different,
|
|
|
|
the strings are \emph{certainly} different.
|
|
|
|
|
|
|
|
Using hashing, we can often make a brute force
|
|
|
|
algorithm efficient.
|
|
|
|
As an example, let's consider a brute force
|
|
|
|
algorithm that calculates how many times
|
|
|
|
a string $p$ occurs as a substring in
|
|
|
|
a string $s$.
|
|
|
|
The algorithm goes through all locations
|
|
|
|
where $p$ can occur, and compares the strings
|
|
|
|
character by character.
|
|
|
|
The time complexity of such an algorithm is $O(n^2)$.
|
|
|
|
|
|
|
|
However, we can make the algorithm more efficient
|
|
|
|
using hashing, because the algorithm compares
|
|
|
|
substrings of strings.
|
|
|
|
Using hashing, each comparison only takes $O(1)$ time,
|
|
|
|
because only hash values of the strings are compared.
|
|
|
|
This results in an algorithm with time complexity $O(n)$,
|
|
|
|
which is the best possible time complexity for this problem.
|
|
|
|
|
|
|
|
By combining hashing and \emph{binary search},
|
|
|
|
it is also possible to check the lexicographic order of
|
|
|
|
two strings in logarithmic time.
|
|
|
|
This can be done by finding out the length
|
|
|
|
of the common prefix of the strings using binary search.
|
|
|
|
Once we know the common prefix,
|
|
|
|
the next character after the prefix
|
|
|
|
indicates the order of the strings.
|
|
|
|
|
|
|
|
\subsubsection*{Collisions and parameters}
|
|
|
|
|
|
|
|
\index{collision}
|
|
|
|
|
|
|
|
An evident risk in comparing hash values is
|
|
|
|
\key{collision}, which means that two strings have
|
|
|
|
different contents but equal hash values.
|
|
|
|
In this case, based on the hash values it seems that
|
|
|
|
the strings are equal, but in reality they aren't,
|
|
|
|
and the algorithm may give incorrect results.
|
|
|
|
|
|
|
|
Collisions are always possible,
|
|
|
|
because the number of different strings is larger
|
|
|
|
than the number of different hash values.
|
|
|
|
However, the probability of a collision is small
|
|
|
|
if the constants $A$ and $B$ are carefully chosen.
|
|
|
|
There are two goals: the hash values should be
|
|
|
|
evenly distributed for the strings,
|
|
|
|
and the number of different hash values should
|
|
|
|
be large enough.
|
|
|
|
|
|
|
|
A good solution is to use large random numbers
|
|
|
|
as constants.
|
|
|
|
A usual way is to choose constants that are
|
|
|
|
near $10^9$, for example
|
2016-12-28 23:54:51 +01:00
|
|
|
\[
|
|
|
|
\begin{array}{lcl}
|
|
|
|
A & = & 911382323 \\
|
|
|
|
B & = & 972663749 \\
|
|
|
|
\end{array}
|
|
|
|
\]
|
2017-01-24 20:59:20 +01:00
|
|
|
This choice ensures that the hash values
|
|
|
|
are distributed evenly enough in the range $0 \ldots B-1$.
|
|
|
|
The benefit in $10^9$ is that
|
|
|
|
the \texttt{long long} type can be used
|
|
|
|
for calculating the hash values,
|
|
|
|
because the products $AB$ and $BB$ fit in \texttt{long long}.
|
|
|
|
But is it enough to have $10^9$ different hash values?
|
|
|
|
|
|
|
|
Let's consider three scenarios where hashing can be used:
|
|
|
|
|
|
|
|
\textit{Scenario 1:} Strings $x$ and $y$ are compared with
|
|
|
|
each other.
|
|
|
|
The probability of a collision is $1/B$ assuming that
|
|
|
|
all hash values are equally probable.
|
|
|
|
|
|
|
|
\textit{Scenario 2:} A string $x$ is compared with strings
|
2016-12-28 23:54:51 +01:00
|
|
|
$y_1,y_2,\ldots,y_n$.
|
2017-01-24 20:59:20 +01:00
|
|
|
The probability for one or more collisions is
|
2016-12-28 23:54:51 +01:00
|
|
|
|
|
|
|
\[1-(1-1/B)^n.\]
|
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
\textit{Scenario 3:} Strings $x_1,x_2,\ldots,x_n$
|
|
|
|
are compared with each other.
|
|
|
|
The probability for one or more collisions is
|
2016-12-28 23:54:51 +01:00
|
|
|
\[ 1 - \frac{B \cdot (B-1) \cdot (B-2) \cdots (B-n+1)}{B^n}.\]
|
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
The following table shows the collision probabilities
|
|
|
|
when the value of $B$ varies and $n=10^6$:
|
2016-12-28 23:54:51 +01:00
|
|
|
|
|
|
|
\begin{center}
|
|
|
|
\begin{tabular}{rrrr}
|
2017-01-24 20:59:20 +01:00
|
|
|
constant $B$ & scenario 1 & scenario 2 & scenario 3 \\
|
2016-12-28 23:54:51 +01:00
|
|
|
\hline
|
|
|
|
$10^3$ & $0.001000$ & $1.000000$ & $1.000000$ \\
|
|
|
|
$10^6$ & $0.000001$ & $0.632121$ & $1.000000$ \\
|
|
|
|
$10^9$ & $0.000000$ & $0.001000$ & $1.000000$ \\
|
|
|
|
$10^{12}$ & $0.000000$ & $0.000000$ & $0.393469$ \\
|
|
|
|
$10^{15}$ & $0.000000$ & $0.000000$ & $0.000500$ \\
|
|
|
|
$10^{18}$ & $0.000000$ & $0.000000$ & $0.000001$ \\
|
|
|
|
\end{tabular}
|
|
|
|
\end{center}
|
|
|
|
|
2017-01-24 20:59:20 +01:00
|
|
|
The table shows that in scenario 1,
|
|
|
|
the probability of a collision is negligible
|
|
|
|
when $B \approx 10^9$.
|
|
|
|
In scenario 2, a collision is possible but the
|
|
|
|
probability is still quite small.
|
|
|
|
However, in scenario 3 the situation is very different:
|
|
|
|
a collision will almost always happen when
|
|
|
|
$B \approx 10^9$.
|
|
|
|
|
|
|
|
\index{birthday paradox}
|
|
|
|
|
|
|
|
The phenomenon in scenario 3 is known as the
|
|
|
|
\key{birthday paradox}: if there are $n$ people
|
|
|
|
in a room, the probability that some two people
|
|
|
|
have the same birthday is large even if $n$ is quite small.
|
|
|
|
In hashing, correspondingly, when all hash values are compared
|
|
|
|
with each other, the probability that some two
|
|
|
|
hash values are the same is large.
|
|
|
|
|
|
|
|
A good way to make the probability of a collision
|
|
|
|
smaller is to calculate \emph{multiple} hash values
|
|
|
|
using different parameters.
|
|
|
|
It is very unlikely that a collision would occur
|
|
|
|
in all hash values at the same time.
|
|
|
|
For example, two hash values with parameter
|
|
|
|
$B \approx 10^9$ correspond to one hash
|
|
|
|
value with parameter $B \approx 10^{18}$,
|
|
|
|
which makes the probability of a collision very small.
|
|
|
|
|
|
|
|
Some people use constants $B=2^{32}$ and $B=2^{64}$,
|
|
|
|
which is convenient, because operations with 32 and 64
|
|
|
|
bit integers are calculated modulo $2^{32}$ and $2^{64}$.
|
|
|
|
However, this is not a good choice, because it is possible
|
|
|
|
to construct inputs that always generate collisions when
|
|
|
|
constants of the form $2^x$ are used\footnote{
|
2016-12-28 23:54:51 +01:00
|
|
|
J. Pachocki and J. Radoszewski:
|
|
|
|
``Where to use and how not to use polynomial string hashing''.
|
|
|
|
\textit{Olympiads in Informatics}, 2013.
|
|
|
|
}.
|
|
|
|
|
|
|
|
\section{Z-algoritmi}
|
|
|
|
|
|
|
|
\index{Z-algoritmi}
|
|
|
|
\index{Z-taulukko}
|
|
|
|
|
|
|
|
\key{Z-algoritmi} muodostaa merkkijonosta \key{Z-taulukon},
|
|
|
|
joka kertoo kullekin merkkijonon kohdalle,
|
|
|
|
mikä on pisin kyseisestä kohdasta alkava osajono,
|
|
|
|
joka on myös merkkijonon alkuosa.
|
|
|
|
Z-algoritmin avulla voi ratkaista tehokkaasti
|
|
|
|
monia merkkijonotehtäviä.
|
|
|
|
|
|
|
|
Z-algoritmi ja merkkijonohajautus ovat usein
|
|
|
|
vaihtoehtoisia tekniikoita, ja on makuasia,
|
|
|
|
kumpaa algoritmia käyttää.
|
|
|
|
Toisin kuin hajautus, Z-algoritmi toimii
|
|
|
|
varmasti oikein eikä siinä ole törmäysten riskiä.
|
|
|
|
Toisaalta Z-algoritmi on vaikeampi toteuttaa eikä
|
|
|
|
se sovellu kaikkeen samaan kuin hajautus.
|
|
|
|
|
|
|
|
\subsubsection*{Algoritmin toiminta}
|
|
|
|
|
|
|
|
Z-algoritmi muodostaa merkkijonolle Z-taulukon,
|
|
|
|
jonka jokaisessa kohdassa lukee,
|
|
|
|
kuinka pitkälle kohdasta
|
|
|
|
alkava osajono vastaa merkkijonon alkuosaa.
|
|
|
|
Esimerkiksi Z-taulukko
|
|
|
|
merkkijonolle \texttt{ACBACDACBACBACDA} on seuraava:
|
|
|
|
|
|
|
|
\begin{center}
|
|
|
|
\begin{tikzpicture}[scale=0.7]
|
|
|
|
\draw (0,0) grid (16,2);
|
|
|
|
|
|
|
|
\node at (0.5, 1.5) {\texttt{A}};
|
|
|
|
\node at (1.5, 1.5) {\texttt{C}};
|
|
|
|
\node at (2.5, 1.5) {\texttt{B}};
|
|
|
|
\node at (3.5, 1.5) {\texttt{A}};
|
|
|
|
\node at (4.5, 1.5) {\texttt{C}};
|
|
|
|
\node at (5.5, 1.5) {\texttt{D}};
|
|
|
|
\node at (6.5, 1.5) {\texttt{A}};
|
|
|
|
\node at (7.5, 1.5) {\texttt{C}};
|
|
|
|
\node at (8.5, 1.5) {\texttt{B}};
|
|
|
|
\node at (9.5, 1.5) {\texttt{A}};
|
|
|
|
\node at (10.5, 1.5) {\texttt{C}};
|
|
|
|
\node at (11.5, 1.5) {\texttt{B}};
|
|
|
|
\node at (12.5, 1.5) {\texttt{A}};
|
|
|
|
\node at (13.5, 1.5) {\texttt{C}};
|
|
|
|
\node at (14.5, 1.5) {\texttt{D}};
|
|
|
|
\node at (15.5, 1.5) {\texttt{A}};
|
|
|
|
|
|
|
|
\node at (0.5, 0.5) {--};
|
|
|
|
\node at (1.5, 0.5) {0};
|
|
|
|
\node at (2.5, 0.5) {0};
|
|
|
|
\node at (3.5, 0.5) {2};
|
|
|
|
\node at (4.5, 0.5) {0};
|
|
|
|
\node at (5.5, 0.5) {0};
|
|
|
|
\node at (6.5, 0.5) {5};
|
|
|
|
\node at (7.5, 0.5) {0};
|
|
|
|
\node at (8.5, 0.5) {0};
|
|
|
|
\node at (9.5, 0.5) {7};
|
|
|
|
\node at (10.5, 0.5) {0};
|
|
|
|
\node at (11.5, 0.5) {0};
|
|
|
|
\node at (12.5, 0.5) {2};
|
|
|
|
\node at (13.5, 0.5) {0};
|
|
|
|
\node at (14.5, 0.5) {0};
|
|
|
|
\node at (15.5, 0.5) {1};
|
|
|
|
|
|
|
|
\footnotesize
|
|
|
|
\node at (0.5, 2.5) {1};
|
|
|
|
\node at (1.5, 2.5) {2};
|
|
|
|
\node at (2.5, 2.5) {3};
|
|
|
|
\node at (3.5, 2.5) {4};
|
|
|
|
\node at (4.5, 2.5) {5};
|
|
|
|
\node at (5.5, 2.5) {6};
|
|
|
|
\node at (6.5, 2.5) {7};
|
|
|
|
\node at (7.5, 2.5) {8};
|
|
|
|
\node at (8.5, 2.5) {9};
|
|
|
|
\node at (9.5, 2.5) {10};
|
|
|
|
\node at (10.5, 2.5) {11};
|
|
|
|
\node at (11.5, 2.5) {12};
|
|
|
|
\node at (12.5, 2.5) {13};
|
|
|
|
\node at (13.5, 2.5) {14};
|
|
|
|
\node at (14.5, 2.5) {15};
|
|
|
|
\node at (15.5, 2.5) {16};
|
|
|
|
|
|
|
|
\end{tikzpicture}
|
|
|
|
\end{center}
|
|
|
|
|
|
|
|
Esimerkiksi kohdassa 7 on arvo 5,
|
|
|
|
koska siitä alkava 5-merkkinen osajono
|
|
|
|
\texttt{ACBAC} on merkkijonon alkuosa,
|
|
|
|
mutta 6-merkkinen osajono \texttt{ACBACB}
|
|
|
|
ei ole enää merkkijonon alkuosa.
|
|
|
|
|
|
|
|
Z-algoritmi käy läpi merkkijonon
|
|
|
|
vasemmalta oikealle ja laskee
|
|
|
|
jokaisessa kohdassa,
|
|
|
|
kuinka pitkälle kyseisestä kohdasta alkava
|
|
|
|
osajono täsmää merkkijonon alkuun.
|
|
|
|
Algoritmi laskee yhteisen
|
|
|
|
alkuosan pituuden vertaamalla
|
|
|
|
merkkijonon alkua ja osajonon alkua toisiinsa.
|
|
|
|
|
|
|
|
Suoraviivaisesti toteutettuna
|
|
|
|
tällaisen algoritmin aikavaativuus olisi $O(n^2)$,
|
|
|
|
koska yhteiset alkuosat voivat olla pitkiä.
|
|
|
|
Z-algoritmissa on kuitenkin yksi tärkeä
|
|
|
|
optimointi, jonka ansiosta algoritmin
|
|
|
|
aikavaativuus on vain $O(n)$.
|
|
|
|
|
|
|
|
Ideana on pitää muistissa väliä $[x,y]$,
|
|
|
|
joka on aiemmin laskettu merkkijonon
|
|
|
|
alkuun täsmäävä väli, jossa $y$ on
|
|
|
|
mahdollisimman suuri.
|
|
|
|
Tällä välillä olevia
|
|
|
|
merkkejä ei tarvitse koskaan
|
|
|
|
verrata uudestaan
|
|
|
|
merkkijonon alkuun, vaan niitä koskevan
|
|
|
|
tiedon saa suoraan Z-taulukon lasketusta osasta.
|
|
|
|
|
|
|
|
Z-algoritmin aikavaativuus on $O(n)$,
|
|
|
|
koska algoritmi aloittaa merkki kerrallaan
|
|
|
|
vertailemisen vasta kohdasta $y+1$.
|
|
|
|
Jos merkit täsmäävät, kohta $y$
|
|
|
|
siirtyy eteenpäin
|
|
|
|
eikä algoritmin tarvitse enää
|
|
|
|
koskaan vertailla tätä kohtaa,
|
|
|
|
vaan algoritmi pystyy hyödyntämään
|
|
|
|
Z-taulukon alussa olevaa tietoa.
|
|
|
|
|
|
|
|
\subsubsection*{Esimerkki}
|
|
|
|
|
|
|
|
Katsotaan nyt, miten Z-algoritmi muodostaa
|
|
|
|
seuraavan Z-taulukon:
|
|
|
|
|
|
|
|
\begin{center}
|
|
|
|
\begin{tikzpicture}[scale=0.7]
|
|
|
|
\draw (0,0) grid (16,2);
|
|
|
|
|
|
|
|
\node at (0.5, 1.5) {A};
|
|
|
|
\node at (1.5, 1.5) {C};
|
|
|
|
\node at (2.5, 1.5) {B};
|
|
|
|
\node at (3.5, 1.5) {A};
|
|
|
|
\node at (4.5, 1.5) {C};
|
|
|
|
\node at (5.5, 1.5) {D};
|
|
|
|
\node at (6.5, 1.5) {A};
|
|
|
|
\node at (7.5, 1.5) {C};
|
|
|
|
\node at (8.5, 1.5) {B};
|
|
|
|
\node at (9.5, 1.5) {A};
|
|
|
|
\node at (10.5, 1.5) {C};
|
|
|
|
\node at (11.5, 1.5) {B};
|
|
|
|
\node at (12.5, 1.5) {A};
|
|
|
|
\node at (13.5, 1.5) {C};
|
|
|
|
\node at (14.5, 1.5) {D};
|
|
|
|
\node at (15.5, 1.5) {A};
|
|
|
|
|
|
|
|
\node at (0.5, 0.5) {--};
|
|
|
|
\node at (1.5, 0.5) {?};
|
|
|
|
\node at (2.5, 0.5) {?};
|
|
|
|
\node at (3.5, 0.5) {?};
|
|
|
|
\node at (4.5, 0.5) {?};
|
|
|
|
\node at (5.5, 0.5) {?};
|
|
|
|
\node at (6.5, 0.5) {?};
|
|
|
|
\node at (7.5, 0.5) {?};
|
|
|
|
\node at (8.5, 0.5) {?};
|
|
|
|
\node at (9.5, 0.5) {?};
|
|
|
|
\node at (10.5, 0.5) {?};
|
|
|
|
\node at (11.5, 0.5) {?};
|
|
|
|
\node at (12.5, 0.5) {?};
|
|
|
|
\node at (13.5, 0.5) {?};
|
|
|
|
\node at (14.5, 0.5) {?};
|
|
|
|
\node at (15.5, 0.5) {?};
|
|
|
|
|
|
|
|
\footnotesize
|
|
|
|
\node at (0.5, 2.5) {1};
|
|
|
|
\node at (1.5, 2.5) {2};
|
|
|
|
\node at (2.5, 2.5) {3};
|
|
|
|
\node at (3.5, 2.5) {4};
|
|
|
|
\node at (4.5, 2.5) {5};
|
|
|
|
\node at (5.5, 2.5) {6};
|
|
|
|
\node at (6.5, 2.5) {7};
|
|
|
|
\node at (7.5, 2.5) {8};
|
|
|
|
\node at (8.5, 2.5) {9};
|
|
|
|
\node at (9.5, 2.5) {10};
|
|
|
|
\node at (10.5, 2.5) {11};
|
|
|
|
\node at (11.5, 2.5) {12};
|
|
|
|
\node at (12.5, 2.5) {13};
|
|
|
|
\node at (13.5, 2.5) {14};
|
|
|
|
\node at (14.5, 2.5) {15};
|
|
|
|
\node at (15.5, 2.5) {16};
|
|
|
|
|
|
|
|
\end{tikzpicture}
|
|
|
|
\end{center}
|
|
|
|
|
|
|
|
Ensimmäinen mielenkiintoinen kohta tulee,
|
|
|
|
kun yhteisen alkuosan pituus on 5.
|
|
|
|
Silloin algoritmi laittaa muistiin
|
|
|
|
välin $[7,11]$ seuraavasti:
|
|
|
|
|
|
|
|
\begin{center}
|
|
|
|
\begin{tikzpicture}[scale=0.7]
|
|
|
|
\fill[color=lightgray] (6,0) rectangle (7,1);
|
|
|
|
\draw (0,0) grid (16,2);
|
|
|
|
|
|
|
|
\node at (0.5, 1.5) {A};
|
|
|
|
\node at (1.5, 1.5) {C};
|
|
|
|
\node at (2.5, 1.5) {B};
|
|
|
|
\node at (3.5, 1.5) {A};
|
|
|
|
\node at (4.5, 1.5) {C};
|
|
|
|
\node at (5.5, 1.5) {D};
|
|
|
|
\node at (6.5, 1.5) {A};
|
|
|
|
\node at (7.5, 1.5) {C};
|
|
|
|
\node at (8.5, 1.5) {B};
|
|
|
|
\node at (9.5, 1.5) {A};
|
|
|
|
\node at (10.5, 1.5) {C};
|
|
|
|
\node at (11.5, 1.5) {B};
|
|
|
|
\node at (12.5, 1.5) {A};
|
|
|
|
\node at (13.5, 1.5) {C};
|
|
|
|
\node at (14.5, 1.5) {D};
|
|
|
|
\node at (15.5, 1.5) {A};
|
|
|
|
|
|
|
|
\node at (0.5, 0.5) {--};
|
|
|
|
\node at (1.5, 0.5) {0};
|
|
|
|
\node at (2.5, 0.5) {0};
|
|
|
|
\node at (3.5, 0.5) {2};
|
|
|
|
\node at (4.5, 0.5) {0};
|
|
|
|
\node at (5.5, 0.5) {0};
|
|
|
|
\node at (6.5, 0.5) {5};
|
|
|
|
\node at (7.5, 0.5) {?};
|
|
|
|
\node at (8.5, 0.5) {?};
|
|
|
|
\node at (9.5, 0.5) {?};
|
|
|
|
\node at (10.5, 0.5) {?};
|
|
|
|
\node at (11.5, 0.5) {?};
|
|
|
|
\node at (12.5, 0.5) {?};
|
|
|
|
\node at (13.5, 0.5) {?};
|
|
|
|
\node at (14.5, 0.5) {?};
|
|
|
|
\node at (15.5, 0.5) {?};
|
|
|
|
|
|
|
|
\draw [decoration={brace}, decorate, line width=0.5mm] (6,3.00) -- (11,3.00);
|
|
|
|
|
|
|
|
\node at (6.5,3.50) {$x$};
|
|
|
|
\node at (10.5,3.50) {$y$};
|
|
|
|
|
|
|
|
|
|
|
|
\footnotesize
|
|
|
|
\node at (0.5, 2.5) {1};
|
|
|
|
\node at (1.5, 2.5) {2};
|
|
|
|
\node at (2.5, 2.5) {3};
|
|
|
|
\node at (3.5, 2.5) {4};
|
|
|
|
\node at (4.5, 2.5) {5};
|
|
|
|
\node at (5.5, 2.5) {6};
|
|
|
|
\node at (6.5, 2.5) {7};
|
|
|
|
\node at (7.5, 2.5) {8};
|
|
|
|
\node at (8.5, 2.5) {9};
|
|
|
|
\node at (9.5, 2.5) {10};
|
|
|
|
\node at (10.5, 2.5) {11};
|
|
|
|
\node at (11.5, 2.5) {12};
|
|
|
|
\node at (12.5, 2.5) {13};
|
|
|
|
\node at (13.5, 2.5) {14};
|
|
|
|
\node at (14.5, 2.5) {15};
|
|
|
|
\node at (15.5, 2.5) {16};
|
|
|
|
|
|
|
|
\end{tikzpicture}
|
|
|
|
\end{center}
|
|
|
|
|
|
|
|
Välin $[7,11]$ hyötynä on, että algoritmi
|
|
|
|
voi sen avulla laskea seuraavat
|
|
|
|
Z-taulukon arvot nopeammin.
|
|
|
|
Koska välin $[7,11]$ merkit ovat samat
|
|
|
|
kuin merkkijonon alussa,
|
|
|
|
myös Z-taulukon arvoissa on vastaavuutta.
|
|
|
|
|
|
|
|
Ensinnäkin kohdissa 8 ja 9
|
|
|
|
tulee olla samat arvot kuin
|
|
|
|
kohdissa 2 ja 3,
|
|
|
|
koska väli $[7,11]$
|
|
|
|
vastaa väliä $[1,5]$:
|
|
|
|
|
|
|
|
\begin{center}
|
|
|
|
\begin{tikzpicture}[scale=0.7]
|
|
|
|
\fill[color=lightgray] (7,0) rectangle (9,1);
|
|
|
|
\draw (0,0) grid (16,2);
|
|
|
|
|
|
|
|
\node at (0.5, 1.5) {A};
|
|
|
|
\node at (1.5, 1.5) {C};
|
|
|
|
\node at (2.5, 1.5) {B};
|
|
|
|
\node at (3.5, 1.5) {A};
|
|
|
|
\node at (4.5, 1.5) {C};
|
|
|
|
\node at (5.5, 1.5) {D};
|
|
|
|
\node at (6.5, 1.5) {A};
|
|
|
|
\node at (7.5, 1.5) {C};
|
|
|
|
\node at (8.5, 1.5) {B};
|
|
|
|
\node at (9.5, 1.5) {A};
|
|
|
|
\node at (10.5, 1.5) {C};
|
|
|
|
\node at (11.5, 1.5) {B};
|
|
|
|
\node at (12.5, 1.5) {A};
|
|
|
|
\node at (13.5, 1.5) {C};
|
|
|
|
\node at (14.5, 1.5) {D};
|
|
|
|
\node at (15.5, 1.5) {A};
|
|
|
|
|
|
|
|
\node at (0.5, 0.5) {--};
|
|
|
|
\node at (1.5, 0.5) {0};
|
|
|
|
\node at (2.5, 0.5) {0};
|
|
|
|
\node at (3.5, 0.5) {2};
|
|
|
|
\node at (4.5, 0.5) {0};
|
|
|
|
\node at (5.5, 0.5) {0};
|
|
|
|
\node at (6.5, 0.5) {5};
|
|
|
|
\node at (7.5, 0.5) {0};
|
|
|
|
\node at (8.5, 0.5) {0};
|
|
|
|
\node at (9.5, 0.5) {?};
|
|
|
|
\node at (10.5, 0.5) {?};
|
|
|
|
\node at (11.5, 0.5) {?};
|
|
|
|
\node at (12.5, 0.5) {?};
|
|
|
|
\node at (13.5, 0.5) {?};
|
|
|
|
\node at (14.5, 0.5) {?};
|
|
|
|
\node at (15.5, 0.5) {?};
|
|
|
|
|
|
|
|
|
|
|
|
\draw [decoration={brace}, decorate, line width=0.5mm] (6,3.00) -- (11,3.00);
|
|
|
|
|
|
|
|
\node at (6.5,3.50) {$x$};
|
|
|
|
\node at (10.5,3.50) {$y$};
|
|
|
|
|
|
|
|
|
|
|
|
\footnotesize
|
|
|
|
\node at (0.5, 2.5) {1};
|
|
|
|
\node at (1.5, 2.5) {2};
|
|
|
|
\node at (2.5, 2.5) {3};
|
|
|
|
\node at (3.5, 2.5) {4};
|
|
|
|
\node at (4.5, 2.5) {5};
|
|
|
|
\node at (5.5, 2.5) {6};
|
|
|
|
\node at (6.5, 2.5) {7};
|
|
|
|
\node at (7.5, 2.5) {8};
|
|
|
|
\node at (8.5, 2.5) {9};
|
|
|
|
\node at (9.5, 2.5) {10};
|
|
|
|
\node at (10.5, 2.5) {11};
|
|
|
|
\node at (11.5, 2.5) {12};
|
|
|
|
\node at (12.5, 2.5) {13};
|
|
|
|
\node at (13.5, 2.5) {14};
|
|
|
|
\node at (14.5, 2.5) {15};
|
|
|
|
\node at (15.5, 2.5) {16};
|
|
|
|
|
|
|
|
|
|
|
|
\draw[thick,<->] (7.5,-0.25) .. controls (7,-1.25) and (2,-1.25) .. (1.5,-0.25);
|
|
|
|
\draw[thick,<->] (8.5,-0.25) .. controls (8,-1.25) and (3,-1.25) .. (2.5,-0.25);
|
|
|
|
\end{tikzpicture}
|
|
|
|
\end{center}
|
|
|
|
|
|
|
|
Seuraavaksi kohdasta 4 saa tietoa kohdan
|
|
|
|
10 arvon laskemiseksi.
|
|
|
|
Koska kohdassa 4 on arvo 2,
|
|
|
|
tämä tarkoittaa, että osajono
|
|
|
|
täsmää kohtaan $y=11$ asti,
|
|
|
|
mutta sen jälkeen on tutkimatonta
|
|
|
|
aluetta merkkijonossa.
|
|
|
|
|
|
|
|
\begin{center}
|
|
|
|
\begin{tikzpicture}[scale=0.7]
|
|
|
|
\fill[color=lightgray] (9,0) rectangle (10,1);
|
|
|
|
\draw (0,0) grid (16,2);
|
|
|
|
|
|
|
|
\node at (0.5, 1.5) {A};
|
|
|
|
\node at (1.5, 1.5) {C};
|
|
|
|
\node at (2.5, 1.5) {B};
|
|
|
|
\node at (3.5, 1.5) {A};
|
|
|
|
\node at (4.5, 1.5) {C};
|
|
|
|
\node at (5.5, 1.5) {D};
|
|
|
|
\node at (6.5, 1.5) {A};
|
|
|
|
\node at (7.5, 1.5) {C};
|
|
|
|
\node at (8.5, 1.5) {B};
|
|
|
|
\node at (9.5, 1.5) {A};
|
|
|
|
\node at (10.5, 1.5) {C};
|
|
|
|
\node at (11.5, 1.5) {B};
|
|
|
|
\node at (12.5, 1.5) {A};
|
|
|
|
\node at (13.5, 1.5) {C};
|
|
|
|
\node at (14.5, 1.5) {D};
|
|
|
|
\node at (15.5, 1.5) {A};
|
|
|
|
|
|
|
|
\node at (0.5, 0.5) {--};
|
|
|
|
\node at (1.5, 0.5) {0};
|
|
|
|
\node at (2.5, 0.5) {0};
|
|
|
|
\node at (3.5, 0.5) {2};
|
|
|
|
\node at (4.5, 0.5) {0};
|
|
|
|
\node at (5.5, 0.5) {0};
|
|
|
|
\node at (6.5, 0.5) {5};
|
|
|
|
\node at (7.5, 0.5) {0};
|
|
|
|
\node at (8.5, 0.5) {0};
|
|
|
|
\node at (9.5, 0.5) {?};
|
|
|
|
\node at (10.5, 0.5) {?};
|
|
|
|
\node at (11.5, 0.5) {?};
|
|
|
|
\node at (12.5, 0.5) {?};
|
|
|
|
\node at (13.5, 0.5) {?};
|
|
|
|
\node at (14.5, 0.5) {?};
|
|
|
|
\node at (15.5, 0.5) {?};
|
|
|
|
|
|
|
|
\draw [decoration={brace}, decorate, line width=0.5mm] (6,3.00) -- (11,3.00);
|
|
|
|
|
|
|
|
\node at (6.5,3.50) {$x$};
|
|
|
|
\node at (10.5,3.50) {$y$};
|
|
|
|
|
|
|
|
|
|
|
|
\footnotesize
|
|
|
|
\node at (0.5, 2.5) {1};
|
|
|
|
\node at (1.5, 2.5) {2};
|
|
|
|
\node at (2.5, 2.5) {3};
|
|
|
|
\node at (3.5, 2.5) {4};
|
|
|
|
\node at (4.5, 2.5) {5};
|
|
|
|
\node at (5.5, 2.5) {6};
|
|
|
|
\node at (6.5, 2.5) {7};
|
|
|
|
\node at (7.5, 2.5) {8};
|
|
|
|
\node at (8.5, 2.5) {9};
|
|
|
|
\node at (9.5, 2.5) {10};
|
|
|
|
\node at (10.5, 2.5) {11};
|
|
|
|
\node at (11.5, 2.5) {12};
|
|
|
|
\node at (12.5, 2.5) {13};
|
|
|
|
\node at (13.5, 2.5) {14};
|
|
|
|
\node at (14.5, 2.5) {15};
|
|
|
|
\node at (15.5, 2.5) {16};
|
|
|
|
|
|
|
|
\draw[thick,<->] (9.5,-0.25) .. controls (9,-1.25) and (4,-1.25) .. (3.5,-0.25);
|
|
|
|
\end{tikzpicture}
|
|
|
|
\end{center}
|
|
|
|
|
|
|
|
Nyt algoritmi alkaa vertailla merkkejä
|
|
|
|
kohdasta $y+1=12$ alkaen merkki kerrallaan.
|
|
|
|
Algoritmi ei voi hyödyntää valmiina
|
|
|
|
Z-taulukossa olevaa tietoa, koska se ei ole vielä aiemmin
|
|
|
|
tutkinut merkkijonoa näin pitkälle.
|
|
|
|
Tuloksena osajonon pituudeksi tulee 7
|
|
|
|
ja väli $[x,y]$ päivittyy vastaavasti:
|
|
|
|
|
|
|
|
\begin{center}
|
|
|
|
\begin{tikzpicture}[scale=0.7]
|
|
|
|
\fill[color=lightgray] (9,0) rectangle (10,1);
|
|
|
|
\draw (0,0) grid (16,2);
|
|
|
|
|
|
|
|
\node at (0.5, 1.5) {A};
|
|
|
|
\node at (1.5, 1.5) {C};
|
|
|
|
\node at (2.5, 1.5) {B};
|
|
|
|
\node at (3.5, 1.5) {A};
|
|
|
|
\node at (4.5, 1.5) {C};
|
|
|
|
\node at (5.5, 1.5) {D};
|
|
|
|
\node at (6.5, 1.5) {A};
|
|
|
|
\node at (7.5, 1.5) {C};
|
|
|
|
\node at (8.5, 1.5) {B};
|
|
|
|
\node at (9.5, 1.5) {A};
|
|
|
|
\node at (10.5, 1.5) {C};
|
|
|
|
\node at (11.5, 1.5) {B};
|
|
|
|
\node at (12.5, 1.5) {A};
|
|
|
|
\node at (13.5, 1.5) {C};
|
|
|
|
\node at (14.5, 1.5) {D};
|
|
|
|
\node at (15.5, 1.5) {A};
|
|
|
|
|
|
|
|
\node at (0.5, 0.5) {--};
|
|
|
|
\node at (1.5, 0.5) {0};
|
|
|
|
\node at (2.5, 0.5) {0};
|
|
|
|
\node at (3.5, 0.5) {2};
|
|
|
|
\node at (4.5, 0.5) {0};
|
|
|
|
\node at (5.5, 0.5) {0};
|
|
|
|
\node at (6.5, 0.5) {5};
|
|
|
|
\node at (7.5, 0.5) {0};
|
|
|
|
\node at (8.5, 0.5) {0};
|
|
|
|
\node at (9.5, 0.5) {7};
|
|
|
|
\node at (10.5, 0.5) {?};
|
|
|
|
\node at (11.5, 0.5) {?};
|
|
|
|
\node at (12.5, 0.5) {?};
|
|
|
|
\node at (13.5, 0.5) {?};
|
|
|
|
\node at (14.5, 0.5) {?};
|
|
|
|
\node at (15.5, 0.5) {?};
|
|
|
|
|
|
|
|
\draw [decoration={brace}, decorate, line width=0.5mm] (9,3.00) -- (16,3.00);
|
|
|
|
|
|
|
|
\node at (9.5,3.50) {$x$};
|
|
|
|
\node at (15.5,3.50) {$y$};
|
|
|
|
|
|
|
|
|
|
|
|
\footnotesize
|
|
|
|
\node at (0.5, 2.5) {1};
|
|
|
|
\node at (1.5, 2.5) {2};
|
|
|
|
\node at (2.5, 2.5) {3};
|
|
|
|
\node at (3.5, 2.5) {4};
|
|
|
|
\node at (4.5, 2.5) {5};
|
|
|
|
\node at (5.5, 2.5) {6};
|
|
|
|
\node at (6.5, 2.5) {7};
|
|
|
|
\node at (7.5, 2.5) {8};
|
|
|
|
\node at (8.5, 2.5) {9};
|
|
|
|
\node at (9.5, 2.5) {10};
|
|
|
|
\node at (10.5, 2.5) {11};
|
|
|
|
\node at (11.5, 2.5) {12};
|
|
|
|
\node at (12.5, 2.5) {13};
|
|
|
|
\node at (13.5, 2.5) {14};
|
|
|
|
\node at (14.5, 2.5) {15};
|
|
|
|
\node at (15.5, 2.5) {16};
|
|
|
|
|
|
|
|
% \draw[thick,<->] (9.5,-0.25) .. controls (9,-1.25) and (4,-1.25) .. (3.5,-0.25);
|
|
|
|
\end{tikzpicture}
|
|
|
|
\end{center}
|
|
|
|
|
|
|
|
Tämän jälkeen kaikkien seuraavien Z-taulukon
|
|
|
|
arvojen laskemisessa pystyy hyödyntämään
|
|
|
|
jälleen välin $[x,y]$ antamaa tietoa
|
|
|
|
ja algoritmi saa Z-taulukon loppuun tulevat
|
|
|
|
arvot suoraan Z-taulukon alusta:
|
|
|
|
|
|
|
|
\begin{center}
|
|
|
|
\begin{tikzpicture}[scale=0.7]
|
|
|
|
\draw (0,0) grid (16,2);
|
|
|
|
|
|
|
|
\node at (0.5, 1.5) {A};
|
|
|
|
\node at (1.5, 1.5) {C};
|
|
|
|
\node at (2.5, 1.5) {B};
|
|
|
|
\node at (3.5, 1.5) {A};
|
|
|
|
\node at (4.5, 1.5) {C};
|
|
|
|
\node at (5.5, 1.5) {D};
|
|
|
|
\node at (6.5, 1.5) {A};
|
|
|
|
\node at (7.5, 1.5) {C};
|
|
|
|
\node at (8.5, 1.5) {B};
|
|
|
|
\node at (9.5, 1.5) {A};
|
|
|
|
\node at (10.5, 1.5) {C};
|
|
|
|
\node at (11.5, 1.5) {B};
|
|
|
|
\node at (12.5, 1.5) {A};
|
|
|
|
\node at (13.5, 1.5) {C};
|
|
|
|
\node at (14.5, 1.5) {D};
|
|
|
|
\node at (15.5, 1.5) {A};
|
|
|
|
|
|
|
|
\node at (0.5, 0.5) {--};
|
|
|
|
\node at (1.5, 0.5) {0};
|
|
|
|
\node at (2.5, 0.5) {0};
|
|
|
|
\node at (3.5, 0.5) {2};
|
|
|
|
\node at (4.5, 0.5) {0};
|
|
|
|
\node at (5.5, 0.5) {0};
|
|
|
|
\node at (6.5, 0.5) {5};
|
|
|
|
\node at (7.5, 0.5) {0};
|
|
|
|
\node at (8.5, 0.5) {0};
|
|
|
|
\node at (9.5, 0.5) {7};
|
|
|
|
\node at (10.5, 0.5) {0};
|
|
|
|
\node at (11.5, 0.5) {0};
|
|
|
|
\node at (12.5, 0.5) {2};
|
|
|
|
\node at (13.5, 0.5) {0};
|
|
|
|
\node at (14.5, 0.5) {0};
|
|
|
|
\node at (15.5, 0.5) {1};
|
|
|
|
|
|
|
|
\draw [decoration={brace}, decorate, line width=0.5mm] (9,3.00) -- (16,3.00);
|
|
|
|
|
|
|
|
\node at (9.5,3.50) {$x$};
|
|
|
|
\node at (15.5,3.50) {$y$};
|
|
|
|
|
|
|
|
|
|
|
|
\footnotesize
|
|
|
|
\node at (0.5, 2.5) {1};
|
|
|
|
\node at (1.5, 2.5) {2};
|
|
|
|
\node at (2.5, 2.5) {3};
|
|
|
|
\node at (3.5, 2.5) {4};
|
|
|
|
\node at (4.5, 2.5) {5};
|
|
|
|
\node at (5.5, 2.5) {6};
|
|
|
|
\node at (6.5, 2.5) {7};
|
|
|
|
\node at (7.5, 2.5) {8};
|
|
|
|
\node at (8.5, 2.5) {9};
|
|
|
|
\node at (9.5, 2.5) {10};
|
|
|
|
\node at (10.5, 2.5) {11};
|
|
|
|
\node at (11.5, 2.5) {12};
|
|
|
|
\node at (12.5, 2.5) {13};
|
|
|
|
\node at (13.5, 2.5) {14};
|
|
|
|
\node at (14.5, 2.5) {15};
|
|
|
|
\node at (15.5, 2.5) {16};
|
|
|
|
|
|
|
|
\end{tikzpicture}
|
|
|
|
\end{center}
|
|
|
|
|
|
|
|
\subsubsection{Z-taulukon käyttäminen}
|
|
|
|
|
|
|
|
Ratkaistaan esimerkkinä tehtävä,
|
|
|
|
jossa laskettavana on,
|
|
|
|
montako kertaa merkkijono $p$
|
|
|
|
esiintyy osajonona merkkijonossa $s$.
|
|
|
|
Ratkaisimme tehtävän aiemmin tehokkaasti
|
|
|
|
merkkijonohajautuksen avulla,
|
|
|
|
ja nyt Z-algoritmi tarjoaa siihen
|
|
|
|
vaihtoehtoisen lähestymistavan.
|
|
|
|
|
|
|
|
Usein esiintyvä idea Z-algoritmin yhteydessä
|
|
|
|
on muodostaa merkkijono,
|
|
|
|
jonka osana on useita välimerkeillä
|
|
|
|
erotettuja merkkijonoja.
|
|
|
|
Tässä tehtävässä sopiva merkkijono on
|
|
|
|
$p$\texttt{\#}$s$,
|
|
|
|
jossa merkkijonojen $p$ ja $s$ välissä on
|
|
|
|
erikoismerkki \texttt{\#},
|
|
|
|
jota ei esiinny merkkijonoissa.
|
|
|
|
Nyt merkkijonoa $p$\texttt{\#}$s$
|
|
|
|
vastaava Z-taulukko kertoo,
|
|
|
|
missä kohdissa merkkijonoa $s$
|
|
|
|
esiintyy merkkijono $p$.
|
|
|
|
Tällaiset kohdat ovat tarkalleen ne
|
|
|
|
Z-taulukon kohdat, joissa on
|
|
|
|
merkkijonon $p$ pituus.
|
|
|
|
|
|
|
|
\begin{samepage}
|
|
|
|
Esimerkiksi jos $s=$\texttt{HATTIVATTI} ja $p=$\texttt{ATT},
|
|
|
|
niin Z-taulukosta tulee:
|
|
|
|
|
|
|
|
\begin{center}
|
|
|
|
\begin{tikzpicture}[scale=0.7]
|
|
|
|
\draw (0,0) grid (14,2);
|
|
|
|
|
|
|
|
\node at (0.5, 1.5) {A};
|
|
|
|
\node at (1.5, 1.5) {T};
|
|
|
|
\node at (2.5, 1.5) {T};
|
|
|
|
\node at (3.5, 1.5) {\#};
|
|
|
|
\node at (4.5, 1.5) {H};
|
|
|
|
\node at (5.5, 1.5) {A};
|
|
|
|
\node at (6.5, 1.5) {T};
|
|
|
|
\node at (7.5, 1.5) {T};
|
|
|
|
\node at (8.5, 1.5) {I};
|
|
|
|
\node at (9.5, 1.5) {V};
|
|
|
|
\node at (10.5, 1.5) {A};
|
|
|
|
\node at (11.5, 1.5) {T};
|
|
|
|
\node at (12.5, 1.5) {T};
|
|
|
|
\node at (13.5, 1.5) {I};
|
|
|
|
|
|
|
|
\node at (0.5, 0.5) {--};
|
|
|
|
\node at (1.5, 0.5) {0};
|
|
|
|
\node at (2.5, 0.5) {0};
|
|
|
|
\node at (3.5, 0.5) {0};
|
|
|
|
\node at (4.5, 0.5) {0};
|
|
|
|
\node at (5.5, 0.5) {3};
|
|
|
|
\node at (6.5, 0.5) {0};
|
|
|
|
\node at (7.5, 0.5) {0};
|
|
|
|
\node at (8.5, 0.5) {0};
|
|
|
|
\node at (9.5, 0.5) {0};
|
|
|
|
\node at (10.5, 0.5) {3};
|
|
|
|
\node at (11.5, 0.5) {0};
|
|
|
|
\node at (12.5, 0.5) {0};
|
|
|
|
\node at (13.5, 0.5) {0};
|
|
|
|
|
|
|
|
\footnotesize
|
|
|
|
\node at (0.5, 2.5) {1};
|
|
|
|
\node at (1.5, 2.5) {2};
|
|
|
|
\node at (2.5, 2.5) {3};
|
|
|
|
\node at (3.5, 2.5) {4};
|
|
|
|
\node at (4.5, 2.5) {5};
|
|
|
|
\node at (5.5, 2.5) {6};
|
|
|
|
\node at (6.5, 2.5) {7};
|
|
|
|
\node at (7.5, 2.5) {8};
|
|
|
|
\node at (8.5, 2.5) {9};
|
|
|
|
\node at (9.5, 2.5) {10};
|
|
|
|
\node at (10.5, 2.5) {11};
|
|
|
|
\node at (11.5, 2.5) {12};
|
|
|
|
\node at (12.5, 2.5) {13};
|
|
|
|
\node at (13.5, 2.5) {14};
|
|
|
|
\end{tikzpicture}
|
|
|
|
\end{center}
|
|
|
|
\end{samepage}
|
|
|
|
Taulukon kohdissa 6 ja 11 on luku 3,
|
|
|
|
mikä tarkoittaa, että \texttt{ATT}
|
|
|
|
esiintyy vastaavissa kohdissa merkkijonossa
|
|
|
|
\texttt{HATTIVATTI}.
|
|
|
|
|
|
|
|
Tuloksena olevan algoritmin aikavaativuus on
|
|
|
|
$O(n)$, koska riittää muodostaa Z-taulukko
|
|
|
|
ja käydä se läpi.
|