From 36409a6c18e0b08876da9e88735db5dd58788ee4 Mon Sep 17 00:00:00 2001 From: Markus Kaiser Date: Tue, 18 Sep 2018 17:15:54 +0200 Subject: [PATCH] Add description of artificial data set --- dynamic_dirichlet_deep_gp.tex | 136 +++++++++++++++++++++------------- preamble/abbreviations.tex | 1 + preamble/packages.tex | 2 +- zotero_export.bib | 24 ++++++ 4 files changed, 109 insertions(+), 54 deletions(-) diff --git a/dynamic_dirichlet_deep_gp.tex b/dynamic_dirichlet_deep_gp.tex index d64949a..3871080 100644 --- a/dynamic_dirichlet_deep_gp.tex +++ b/dynamic_dirichlet_deep_gp.tex @@ -195,7 +195,6 @@ A lower bound $\Ell_{\text{DMGP}}$ for the log-joint $\log\Prob*{\mat{Y}, \mat{A \end{align} Due to the structure of~\cref{eq:variational_distribution}, the bound factorizes along the data, enabling stochastic optimization. This bound has complexity $\Fun*{\Oh}{NM^2K}$ to evaluate. -\todo[inline]{This holds even for binary $\mat{a_n}$ since we need to calculate the $\mat{\alpha_n}$.} \subsection{Optimization of the Lower Bound} \label{subsec:computation} @@ -215,11 +214,45 @@ Using this relaxation causes multiple problems in practice. Most importantly, explaining data points as mixtures of modes can substantially simplify the learning problem while violating the modelling assumption that every data point was generated using exactly one mode. Because of this, special care must be taken during optimization to enforce the sparsity of $\Variat*{\mat{a_n}}$. +% +\begin{figure*}[t] + \centering + \begin{subfigure}{.475\linewidth} + \centering + \includestandalone{figures/semi_bimodal_joint} + \caption{ + \label{fig:semi_bimodal:a} + Predictive posterior. + } + \end{subfigure} + \hfill + \begin{subfigure}{.475\linewidth} + \centering + \includestandalone{figures/semi_bimodal_attrib} + \caption{ + \label{fig:semi_bimodal:b} + Posterior assignment probability. + } + \end{subfigure} + \caption{ + \label{fig:semi_bimodal} + Semi-Bimodal data. 
+ } +\end{figure*} +% +\begin{figure}[t] + \centering + \includestandalone{figures/semi_bimodal_attrib_process} + \caption{ + \label{fig:semi_bimodal:c} + Semi bimodal attrib process. + } +\end{figure} +% To avoid this problem, we propose using a different relaxation based on additional stochasticity. Instead of directly using $\Variat*{\mat{a_n}}$ to combine the $\mat{f_n^{\pix{k}}}$, we first draw a sample $\mat{\hat{a}_n}$ from a Concrete random variable as suggested by \textcite{maddison_concrete_2016} whose parameters are given by $\Variat*{\mat{a_n}}$. Based on a temperature parameter $\lambda$, a Concrete random variable enforces sparsity but is also continuous and yields informative gradients using automatic differentiation. Samples from a Concrete random variable are unit vectors and for $\lambda \to 0$ their distribution approaches a discrete distribution. -\todo[inline]{More details here?} Our approximate evaluation of the bound in \cref{eq:variational_bound} during optimization has multiple sources of stochasticity, all of which are unbiased. First, we approximate the expectations using Monte Carlo samples $\mat{\hat{f}_n^{\pix{k}}}$, $\mat{\hat{\alpha}_n^{\pix{k}}}$, and $\mat{\hat{a}_n}$. @@ -230,9 +263,12 @@ And second, the factorization of the bound along the data allows us to use mini- \label{subsec:predictions} Predictions for a test location $\mat{x_\ast}$ are mixtures of $K$ independent Gaussians, given by \begin{align} +\begin{split} + \label{eq:predictive_posterior} \Variat*{\mat{f_\ast} \given \mat{x_\ast}} &= \int \sum_{k=1}^K \Variat*{a_\ast^{\pix{k}} \given \mat{x_\ast}} \Variat*{\mat{f_\ast^{\pix{k}}} \given \mat{x_\ast}} \diff \mat{a_\ast^{\pix{k}}}\\ &\approx \sum_{k=1}^K \hat{a}_\ast^{\pix{k}} \mat{\hat{f}_\ast^{\pix{k}}}. 
+\end{split} \end{align} The predictive posteriors of the $K$ modes $\Variat*{\mat{f_\ast^{\pix{k}}} \given \mat{x_\ast}}$ are given by $K$ independent shallow Gaussian processes and can be calculated analytically \parencite{hensman_gaussian_2013}. Samples from the predictive density over $\Variat*{\mat{a_\ast} \given \mat{x_\ast}}$ can be obtained by sampling from the Gaussian process posteriors $\Variat*{\mat{\alpha_\ast^{\pix{k}}} \given \mat{x_\ast}}$ and renormalizing the resulting vector $\mat{\alpha_\ast}$ using the $\softmax$-function. @@ -241,59 +277,33 @@ The distribution $\Variat*{\mat{a_\ast} \given \mat{x_\ast}}$ reflects the model \section{Experiments} \label{sec:experiments} +In this section we investigate the behavior of the DMGP model in multiple regression settings. +First, we apply the DMGP to an artificial data set and showcase how the different components of the model interact to identify unimodal and multimodal parts of the input space. +Second, we show how different priors on the different modes can be used to separate a signal from unrelated noise. +\todo{Reformulate in accordance with the introduction}And third, we investigate a data set which contains observations of two independent dynamical systems mixed together and show how the DMGP can recover information about both systems. + +We use an implementation of DMGP in TensorFlow \parencite{tensorflow2015-whitepaper} based on GPflow \parencite{matthews_gpflow_2017} and the implementation of doubly stochastic variational inference \parencite{salimbeni_doubly_2017}. -\subsection{Artificial data} + +\subsection{Artificial data set} \label{subsec:semi_bimodal} -\begin{figure}[t] - \centering - \includestandalone{figures/semi_bimodal_joint} - \caption{ - \label{fig:semi_bimodal_joint} - Semi bimodal joint. - } -\end{figure} -\begin{figure}[t] - \centering - \includestandalone{figures/semi_bimodal_attrib_process} - \caption{ - \label{fig:semi_bimodal_attrib_process} - Semi bimodal attrib process. 
- } -\end{figure} -\begin{figure}[t] - \centering - \includestandalone{figures/semi_bimodal_attrib} - \caption{ - \label{fig:semi_bimodal_attrib} - Semi bimodal attrib. - } -\end{figure} -\begin{itemize} - \item Semi-bimodal data to showcase the model? -\end{itemize} +To demonstrate inference in our model, we begin with an experiment based on an artificial data set. +The data, together with recovered posterior attributions, can be seen in \cref{fig:semi_bimodal:b}. +We uniformly sampled 350 data points in the interval $x \in [-2\pi, 2\pi]$ and obtained $y$ as $y = \Fun{\sin}{x} - \delta \cdot 2 \Fun{\exp}{-0.5 \cdot (x-2)^2} + \epsilon$ with a Bernoulli-distributed $\delta \sim \Fun{\Ber}{0.5}$ to introduce a multi-modality and additive independent noise $\epsilon \sim \Gaussian*{0, 0.005^2}$. -\subsection{ChoiceNet data} +We use squared exponential kernels as priors for both the $f^{\pix{k}}$ and $\alpha^{\pix{k}}$ and $M=25$ pseudo-inputs in every GP. +\Cref{fig:semi_bimodal,fig:semi_bimodal:c} show the posterior of a bimodal DMGP applied to the data which correctly identified the underlying functions. +\Cref{fig:semi_bimodal:b} shows the posterior belief about the assignments $\mat{A}$ and illustrates that DMGP separated the input space into a part which is unimodal where all points have been assigned to the same mode and a bimodal part where both modes are necessary to explain the data. + +This distinction is explicitly represented in the model via the assignment processes $\mat{\alpha}$ shown in \cref{fig:semi_bimodal:c}. +The model has learned that the second mode is irrelevant at, say, $x=-5$ and predictions using \cref{eq:predictive_posterior} simplify to a standard GP posterior. +The DMGP is implicitly incentivised to explain the data using only one mode if possible through the likelihood term of the inferred $\mat{a_n}$ in \cref{eq:variational_bound}. 
+At $x = -10$ it can be seen that both the two modes and the assignment processes start reverting to their respective priors away from the data. + + +\subsection{Robust Regression} \label{subsec:choicenet} -\begin{table*}[t] - \centering - \caption{ - \label{tab:choicenet} - Choicenet results. - } - \newcolumntype{Y}{>{\centering\arraybackslash}X} - \newcolumntype{Z}{>{\columncolor{sStone}\centering\arraybackslash}X} - \begin{tabularx}{\linewidth}{rY|YZZZZZZ} - \toprule - Outliers & DMGP (MLL) & DMGP (RMSE) & CN & MDN & MLP & GPR & LGPR & RGPR \\ - \midrule - 0\,\% & 2.86 & \textbf{0.008} & 0.034 & 0.028 & 0.039 & \textbf{0.008} & 0.022 & 0.017 \\ - 20\,\% & 2.71 & \textbf{0.008} & 0.022 & 0.087 & 0.413 & 0.280 & 0.206 & 0.013 \\ - 40\,\% & 2.12 & \textbf{0.005} & 0.018 & 0.565 & 0.452 & 0.447 & 0.439 & 1.322 \\ - 60\,\% & 0.874 & 0.031 & \textbf{0.023} & 0.645 & 0.636 & 0.602 & 0.579 & 0.738 \\ - 80\,\% & 0.126 & 0.128 & \textbf{0.084} & 0.778 & 0.829 & 0.779 & 0.777 & 1.523 \\ - \bottomrule - \end{tabularx} -\end{table*} +% \begin{figure*}[t] \centering \begin{subfigure}{.475\linewidth} @@ -320,6 +330,28 @@ The distribution $\Variat*{\mat{a_\ast} \given \mat{x_\ast}}$ reflects the model In the posterior, $\mat{\alpha^{\pix{2}}} = 0$. } \end{figure*} +% +\begin{table*}[t] + \centering + \caption{ + \label{tab:choicenet} + Choicenet results. 
+ } + \newcolumntype{Y}{>{\centering\arraybackslash}X} + \newcolumntype{Z}{>{\columncolor{sStone}\centering\arraybackslash}X} + \begin{tabularx}{\linewidth}{rY|YZZZZZZ} + \toprule + Outliers & DMGP (MLL) & DMGP (RMSE) & CN & MDN & MLP & GPR & LGPR & RGPR \\ + \midrule + 0\,\% & 2.86 & \textbf{0.008} & 0.034 & 0.028 & 0.039 & \textbf{0.008} & 0.022 & 0.017 \\ + 20\,\% & 2.71 & \textbf{0.008} & 0.022 & 0.087 & 0.413 & 0.280 & 0.206 & 0.013 \\ + 40\,\% & 2.12 & \textbf{0.005} & 0.018 & 0.565 & 0.452 & 0.447 & 0.439 & 1.322 \\ + 60\,\% & 0.874 & 0.031 & \textbf{0.023} & 0.645 & 0.636 & 0.602 & 0.579 & 0.738 \\ + 80\,\% & 0.126 & 0.128 & \textbf{0.084} & 0.778 & 0.829 & 0.779 & 0.777 & 1.523 \\ + \bottomrule + \end{tabularx} +\end{table*} +% \begin{itemize} \item Our results for the choicenet data + comparison \end{itemize} @@ -333,14 +365,12 @@ The distribution $\Variat*{\mat{a_\ast} \given \mat{x_\ast}}$ reflects the model \item Compare to standard GP (bad), Bayesian NN? \end{itemize} - \section{Conclusion} \label{sec:conclusion} \begin{itemize} \item Our model ist the best model there ever was or will be. 
\end{itemize} - \nocite{*} \printbibliography diff --git a/preamble/abbreviations.tex b/preamble/abbreviations.tex index b4ab705..6d56b67 100644 --- a/preamble/abbreviations.tex +++ b/preamble/abbreviations.tex @@ -53,6 +53,7 @@ \DeclareMathOperator{\Q}{\mathcal{Q}} \DeclareMathOperator{\Norm}{\mathcal{N}} \DeclareMathOperator{\Multi}{\mathcal{M}} +\DeclareMathOperator{\Ber}{\mathcal{B}} \DeclareMathOperator{\Uni}{\mathbb{U}} \DeclareMathOperator{\Ind}{\mathbb{I}} \DeclareMathOperator{\GP}{\mathcal{GP}} diff --git a/preamble/packages.tex b/preamble/packages.tex index 93c127f..81a0752 100644 --- a/preamble/packages.tex +++ b/preamble/packages.tex @@ -22,7 +22,7 @@ \usepackage{microtype} % Layout -\usepackage[labelfont=bf, format=plain]{caption} +\usepackage[labelfont=bf, format=plain, indention=1.5em]{caption} \usepackage[skip=3pt]{subcaption} % Tables and lists diff --git a/zotero_export.bib b/zotero_export.bib index f35a5ac..d270538 100644 --- a/zotero_export.bib +++ b/zotero_export.bib @@ -40,6 +40,21 @@ file = {C:\\Users\\markus\\Zotero\\storage\\ILUA2D27\\Hathaway und Bezdek - 1993 - Switching regression models and fuzzy clustering.pdf;C:\\Users\\markus\\Zotero\\storage\\KT65LIIU\\236552.html} } +@article{kaiser_bayesian_2017, + archivePrefix = {arXiv}, + eprinttype = {arxiv}, + eprint = {1710.02766}, + primaryClass = {cs, stat}, + title = {Bayesian {{Alignments}} of {{Warped Multi}}-{{Output Gaussian Processes}}}, + url = {http://arxiv.org/abs/1710.02766}, + abstract = {We propose a novel Bayesian approach to modelling nonlinear alignments of time series based on latent shared information. We apply the method to the real-world problem of finding common structure in the sensor data of wind turbines introduced by the underlying latent and turbulent wind field. The proposed model allows for both arbitrary alignments of the inputs and non-parametric output warpings to transform the observations. 
This gives rise to multiple deep Gaussian process models connected via latent generating processes. We present an efficient variational approximation based on nested variational compression and show how the model can be used to extract shared information between dependent time series, recovering an interpretable functional decomposition of the learning problem. We show results for an artificial data set and real-world data of two wind turbines.}, + urldate = {2018-06-08}, + date = {2017-10-07}, + keywords = {Computer Science - Learning,Statistics - Machine Learning}, + author = {Kaiser, Markus and Otte, Clemens and Runkler, Thomas and Ek, Carl Henrik}, + file = {C:\\Users\\markus\\Zotero\\storage\\MJQDUDFP\\Kaiser et al. - 2017 - Bayesian Alignments of Warped Multi-Output Gaussia.pdf;C:\\Users\\markus\\Zotero\\storage\\UKGW6CEX\\1710.html} +} + @article{matthews_gpflow_2017, title = {{{GPflow}}: {{A Gaussian}} Process Library Using {{TensorFlow}}}, volume = {18}, @@ -220,4 +235,13 @@ file = {C:\\Users\\markus\\Zotero\\storage\\89SIZL5F\\Kingma et al. - 2015 - Variational Dropout and the Local Reparameterizati.pdf;C:\\Users\\markus\\Zotero\\storage\\VE5EGL5C\\5666-variational-dropout-and-the-local-reparameterization-trick.html} } +@article{tensorflow2015-whitepaper, + title = {{{TensorFlow}}: {{Large}}-{{Scale Machine Learning}} on {{Heterogeneous Systems}}}, + url = {https://www.tensorflow.org/}, + date = {2015}, + author = {Abadi, Martín and Agarwal, Ashish and Barham, Paul and Brevdo, Eugene and Chen, Zhifeng and Citro, Craig and Corrado, Greg S. 
and Davis, Andy and Dean, Jeffrey and Devin, Matthieu and Ghemawat, Sanjay and Goodfellow, Ian and Harp, Andrew and Irving, Geoffrey and Isard, Michael and Jia, Yangqing and Jozefowicz, Rafal and Kaiser, Lukasz and Kudlur, Manjunath and Levenberg, Josh and Mané, Dandelion and Monga, Rajat and Moore, Sherry and Murray, Derek and Olah, Chris and Schuster, Mike and Shlens, Jonathon and Steiner, Benoit and Sutskever, Ilya and Talwar, Kunal and Tucker, Paul and Vanhoucke, Vincent and Vasudevan, Vijay and Viégas, Fernanda and Vinyals, Oriol and Warden, Pete and Wattenberg, Martin and Wicke, Martin and Yu, Yuan and Zheng, Xiaoqiang}, + file = {C:\\Users\\markus\\Zotero\\storage\\WLKK3HI7\\Abadi et al. - 2015 - TensorFlow Large-Scale Machine Learning on Hetero.pdf}, + note = {Software available from tensorflow.org} +} +