\documentclass{article}
\usepackage{amsfonts}
\newcommand{\set}[1]{\{#1\}}
\newcommand{\seq}[1]{\langle#1\rangle}
\newcommand{\size}[1]{|#1|}
\newcommand{\Smax}{S_{\mathit{max}}}
\newcommand{\Smin}{S_{\mathit{min}}}
\newcommand{\Sran}{S_{\mathit{ran}}}
\newcommand{\tmax}{t_{\mathit{max}}}
\newcommand{\tmin}{t_{\mathit{min}}}
\newcommand{\ALGORITHM}{\textbf{algorithm}}
\newcommand{\WHILE}{\textbf{while}}
\newcommand{\DO}{\textbf{do}}
\newcommand{\ENDWHILE}{\textbf{end while}}
\begin{document}
\section*{Problem 1: Definitions}
\subsubsection*{Simple Stochastic Games}
A \emph{simple stochastic game} (SSG) $G=\seq{\Smax,\Smin,\Sran,l,r}$
consists of the following components:
\begin{itemize}
\item
A set $\Smax$ of \emph{max states};
a set $\Smin$ of \emph{min states};
and a set $\Sran$ of \emph{random states}.
The three sets $\Smax$, $\Smin$, and $\Sran$ are pairwise disjoint.
We write $S=\Smax\cup\Smin\cup\Sran$ and $S^+=S\cup\set{\tmax,\tmin}$,
where $\tmax,\tmin\not\in S$ are special \emph{target states}.
\item
Two successor functions $l\colon S\rightarrow S^+$ and
$r\colon S\rightarrow S^+$.
\end{itemize}
A state $s\in S$ is \emph{deterministic} if $l(s)=r(s)$.
The SSG $G$ is a \emph{min MDP} if all states in $\Smax$ are deterministic;
a \emph{max MDP}, if all states in $\Smin$ are deterministic;
and a \emph{Markov chain}, if all states in $\Smax\cup\Smin$ are
deterministic.
\subsubsection*{Plays}
The game is played between two players, a max player and a min player.
Initially a token is placed on one of the states.
If the token is on a max state, then the max player moves it to one of
the two successor states;
if the token is on a min state, then the min player moves it to one of
the two successor states;
and if the token is on a random state, then it is moved with probability
$1/2$ to each of the two successor states.
This process is repeated until the token visits a target state.
If the token visits the target~$\tmax$, then the max player wins;
if the token visits~$\tmin$, then the min player wins;
and if the game is played forever without the token visiting a target, then
neither player wins.
Formally, a \emph{winning play} of the SSG $G$ is a finite sequence
$\omega=\seq{s_0,s_1,\ldots,s_k}$ of states ($k\geq 0$) such that
(1)~for all $0\leq i<k$, we have $s_i\in S$ and
$s_{i+1}\in\set{l(s_i),r(s_i)}$, and
(2)~$s_k\in\set{\tmax,\tmin}$.
The winning play $\omega$ is \emph{max winning} if $s_k=\tmax$,
and \emph{min winning} if $s_k=\tmin$.
\subsubsection*{Strategies and Values}
A \emph{positional max strategy} is a function
$\sigma\colon\Smax\rightarrow S^+$ such that
$\sigma(s)\in\set{l(s),r(s)}$ for all $s\in\Smax$;
similarly, a \emph{positional min strategy} is a function
$\pi\colon\Smin\rightarrow S^+$ such that
$\pi(s)\in\set{l(s),r(s)}$ for all $s\in\Smin$.
We write $\Sigma$ for the set of positional max strategies, and $\Pi$ for
the set of positional min strategies.
Given a state $s\in S^+$ and two strategies $\sigma\in\Sigma$ and
$\pi\in\Pi$, let $\Omega_s^{\sigma,\pi}$ be the set of winning plays
$\seq{s_0,s_1,\ldots,s_k}$ with $s_0=s$ such that for all $0\leq i<k$,
if $s_i\in\Smax$, then $s_{i+1}=\sigma(s_i)$, and
if $s_i\in\Smin$, then $s_{i+1}=\pi(s_i)$;
and let $W_s^{\sigma,\pi}\subseteq\Omega_s^{\sigma,\pi}$ be the subset of
max winning plays.
The SSG $G$ is \emph{stopping} if for all strategies $\sigma\in\Sigma$ and
$\pi\in\Pi$ and all states $s\in S^+$, we have
$\Pr[\Omega_s^{\sigma,\pi}]=1$; that is, no matter how the two players
play, the token visits a target state with probability~1.
For every state $s\in S^+$ of a stopping SSG, define the \emph{max value}
\[x_s=\max_{\sigma\in\Sigma}\min_{\pi\in\Pi}
\Pr[W_s^{\sigma,\pi}]\]
to be the greatest probability of max winning from $s$ that the max player
can ensure against any min strategy.
The \emph{SSG decision problem} asks, given a stopping SSG and a state
$s\in S$, whether $x_s>1/2$.
\subsubsection*{Determinacy}
Stopping SSGs are \emph{zero-sum} games, which means that if the max player
wins the game, then the min player loses, and vice versa.
This is because each winning play is either max winning or min winning, but
not both.
An important property of two-player zero-sum games is \emph{determinacy},
which means that if both players play optimally, then their respective
probabilities of winning add up to~1.
Stopping SSGs are indeed determined for positional strategies.
Formally, for every state $s\in S^+$ of a stopping SSG, define the
\emph{min value}
\[y_s=\min_{\pi\in\Pi}\max_{\sigma\in\Sigma}
\Pr[\Omega_s^{\sigma,\pi}\setminus W_s^{\sigma,\pi}]\]
to be the greatest probability of min winning from $s$ that the min player
can ensure against any max strategy.
Then it can be shown that $y_s=1-x_s$ for all states $s\in S^+$.
\subsubsection*{Value Improvement}
The max values of a stopping SSG satisfy the following
\emph{transition equations}, one for each state:
\begin{itemize}
\item
for every state $s\in\Smax$, we have $x_s=\max\set{x_{l(s)},x_{r(s)}}$;
\item
for every state $s\in\Smin$, we have $x_s=\min\set{x_{l(s)},x_{r(s)}}$;
\item
for every state $s\in\Sran$, we have
$x_s=0.5\cdot x_{l(s)}+0.5\cdot x_{r(s)}$;
\item
$x_{\tmax}=1$ and $x_{\tmin}=0$.
\end{itemize}
The transition equations can be used to compute all max values:
\begin{verse}
\ALGORITHM\ ValueImprovement\\
\quad Input: stopping SSG $G=\seq{\Smax,\Smin,\Sran,l,r}$.\\
\quad Output: max values $x_s$ for all states $s\in S^+$.\\
\smallskip
\quad $x_{\tmax}:=1$; $x_{\tmin}:=0$;\\
\quad $x_s:=1$ for all $s\in\Smax$;\\
\quad $x_s:=0$ for all $s\in\Smin$;\\
\quad $x_s:=0.5$ for all $s\in\Sran$;\\
\quad \WHILE\ some $x_s$ changes \DO\\
\qquad $x_s:=\max\set{x_{l(s)},x_{r(s)}}$ for all $s\in\Smax$;\\
\qquad $x_s:=\min\set{x_{l(s)},x_{r(s)}}$ for all $s\in\Smin$;\\
\qquad $x_s:=0.5\cdot x_{l(s)}+0.5\cdot x_{r(s)}$ for all $s\in\Sran$;\\
\qquad \ENDWHILE.
\end{verse}
The algorithm ValueImprovement is guaranteed to converge to the max values,
but it may require an exponential number of iterations of the while loop,
even in the special case that $G$ is a Markov chain
(can you find an example?).
However, if $G$ is a Markov chain or an MDP, then the transition equations
can be solved much more efficiently.
For Markov chains, we can directly solve the resulting system of linear
equations:
\begin{verse}
$x_s=x_{l(s)}$ for all $s\in\Smax\cup\Smin$;\\
$x_s=0.5\cdot x_{l(s)}+0.5\cdot x_{r(s)}$ for all $s\in\Sran$;\\
$x_{\tmax}=1$ and $x_{\tmin}=0$.
\end{verse}
For min MDPs, we can solve a linear optimization problem:
\begin{verse}
$x_s=x_{l(s)}$ for all $s\in\Smax$;\\
$x_s\leq x_{l(s)}$ and $x_s\leq x_{r(s)}$ for all $s\in\Smin$;\\
$x_s=0.5\cdot x_{l(s)}+0.5\cdot x_{r(s)}$ for all $s\in\Sran$;\\
$x_{\tmax}=1$ and $x_{\tmin}=0$;\\
maximize $\sum_{s\in\Smin}x_s$.
\end{verse}
It follows that, given a positional max strategy $\sigma\in\Sigma$, we can
compute on the min MDP $G^\sigma$ an optimal min strategy against $\sigma$
by linear programming, in polynomial time (how?).
Moreover, we can check if $\sigma$ is optimal against \emph{any} min
strategy in polynomial time (how?).
As a corollary, we conclude that the SSG decision problem is in
NP $\cap$ coNP (why?).
No polynomial-time algorithm is known for solving SSGs.
\subsubsection*{Random Permutations}
A \emph{random permutation} for an SSG is a permutation of the random
states.
We associate with every random permutation $p=\seq{s_1,\ldots,s_n}$ a
positional max strategy $\sigma_p\in\Sigma$, which intuitively behaves as
follows:
\begin{itemize}
\item
For all $0\leq i\leq n$, if $s$ is a state from which the max player
can ensure that a state in $\set{s_{i+1},\ldots,s_n,\tmax}$ is visited
without visiting a state in $\set{s_1,\ldots,s_i}$, then $\sigma_p(s)$
follows such a strategy.
\item
If $s$ is a state from which the max player cannot ensure that a state
in $\set{s_1,\ldots,s_n,\tmax}$ is visited, then $\sigma_p(s)$ is chosen
arbitrarily.
\end{itemize}
Given a random permutation~$p$, we can compute the strategy $\sigma_p$ in
linear time (how?).
Gimbert and Horn (2008) have proved that for every stopping SSG, there
exists a random permutation $p$ such that $\sigma_p$ is optimal against
\emph{any} min strategy.
Your task is to define and evaluate heuristics for constructing ``good''
random permutations.
\end{document}