
Cleanup grad.tex paper a bit

rexim 2 years ago
commit 05f1f01851
2 changed files with 36 additions and 16 deletions
  1. papers/grad.pdf (BIN)
  2. papers/grad.tex (+36 −16)

papers/grad.pdf (BIN)

papers/grad.tex (+36 −16)

@@ -6,11 +6,33 @@
 \begin{document}
 \section{Gradient Descent}

+If we keep decreasing $\epsilon$ in our Finite Difference approach, we effectively get the derivative of the cost function.
+
 \begin{align}
   C'(w) = \lim_{\epsilon \to 0}\frac{C(w + \epsilon) - C(w)}{\epsilon}
 \end{align}
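
As a quick sanity check of this limit, here is a minimal Python sketch (the quadratic cost below is a made-up toy, not one of the paper's models) showing the finite difference approaching the exact derivative as $\epsilon$ shrinks:

# Toy cost C(w) = (w - 3)^2, whose exact derivative is C'(w) = 2(w - 3).
def C(w):
    return (w - 3.0) ** 2

# Finite Difference approximation of C'(w).
def finite_diff(C, w, eps):
    return (C(w + eps) - C(w)) / eps

w = 1.0
for eps in (1e-1, 1e-3, 1e-6):
    print(eps, finite_diff(C, w, eps))  # tends to C'(1.0) = -4.0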
 
 
-\subsection{``Twice''}
+Let's compute the derivatives of all of our models. Throughout the paper, $n$ denotes the number of samples in the training set.
+
+\subsection{Linear Model}
+
+\def\d{2.0}
+
+\begin{center}
+  \begin{tikzpicture}
+    \node (X) at ({-\d*0.75}, 0) {$x$};
+    \node[shape=circle,draw=black] (N) at (0, 0) {$w$};
+    \node (Y) at ({\d*0.75}, 0) {$y$};
+    \path[->] (X) edge (N);
+    \path[->] (N) edge (Y);
+  \end{tikzpicture}
+\end{center}
+
+\begin{align}
+  y &= x \cdot w
+\end{align}
+
+\subsubsection{Cost}

 \begin{align}
   C(w) &= \frac{1}{n}\sum_{i=1}^{n}(x_iw - y_i)^2 \\
@@ -18,18 +40,11 @@
       &= \left(\frac{1}{n}\sum_{i=1}^{n}(x_iw - y_i)^2\right)' = \\
       &= \frac{1}{n}\left(\sum_{i=1}^{n}(x_iw - y_i)^2\right)' \\
       &= \frac{1}{n}\sum_{i=1}^{n}\left((x_iw - y_i)^2\right)' \\
-       &= \frac{1}{n}\sum_{i=1}^{n}2(x_iw - y_i)x_i \\
-\end{align}
-
-\begin{align}
-  C(w) &= \frac{1}{n}\sum_{i=1}^{n}(x_iw - y_i)^2 \\  
-  C'(w) &= \frac{1}{n}\sum_{i=1}^{n}2(x_iw - y_i)x_i \\
+       &= \frac{1}{n}\sum_{i=1}^{n}2(x_iw - y_i)x_i
 \end{align}
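
The closed form above is easy to validate numerically. A minimal Python sketch, with made-up training data, comparing the derived gradient against a Finite Difference:

# dC/dw = (1/n) * sum of 2*(x_i*w - y_i)*x_i, checked numerically.
xs = [1.0, 2.0, 3.0]   # hypothetical training inputs
ys = [2.0, 4.0, 6.0]   # hypothetical targets (y = 2x)
n = len(xs)

def cost(w):
    return sum((x * w - y) ** 2 for x, y in zip(xs, ys)) / n

def dcost(w):
    return sum(2 * (x * w - y) * x for x, y in zip(xs, ys)) / n

w, eps = 0.5, 1e-6
print(dcost(w))                          # analytic: -14.0
print((cost(w + eps) - cost(w)) / eps)   # nearly the same value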
 
 
 \subsection{One Neuron Model with 2 inputs}

-\def\d{2.0}
-
 \begin{center}
   \begin{tikzpicture}
     \node (X) at (-\d, 1) {$x$};
@@ -44,7 +59,7 @@
 \begin{align}
   y &= \sigma(xw_1 + yw_2 + b) \\
   \sigma(x) &= \frac{1}{1 + e^{-x}} \\
-  \sigma'(x) &= \sigma(x)(1 - \sigma(x)) \\
+  \sigma'(x) &= \sigma(x)(1 - \sigma(x))
 \end{align}
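
The identity $\sigma'(x) = \sigma(x)(1 - \sigma(x))$ is easy to verify numerically; a short sketch:

import math

def sigma(x):
    return 1.0 / (1.0 + math.exp(-x))

# sigma'(x) via the identity vs. a finite difference.
x, eps = 0.7, 1e-6
print(sigma(x) * (1 - sigma(x)))
print((sigma(x + eps) - sigma(x)) / eps)  # nearly the same value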
 
 
 \subsubsection{Cost}
@@ -65,7 +80,7 @@
      &= \avgsum[i, n]2(a_i - z_i)\pd[w_1]a_i = \\
      &= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i)x_i \\
  \pd[w_2] C &= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i)y_i \\
-  \pd[b] C &= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i) \\
+  \pd[b] C &= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i)
 \end{align}
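
A minimal Python sketch of these three gradients (samples and starting parameters are made up; $z_i$ is the target for sample $i$ and $a_i$ the neuron's activation, matching the formulas above):

import math

def sigma(t):
    return 1.0 / (1.0 + math.exp(-t))

samples = [(0.0, 1.0, 1.0), (1.0, 0.0, 1.0), (1.0, 1.0, 0.0)]  # (x_i, y_i, z_i)
w1, w2, b = 0.1, -0.2, 0.0
n = len(samples)

dw1 = dw2 = db = 0.0
for x, y, z in samples:
    a = sigma(x * w1 + y * w2 + b)
    dw1 += 2 * (a - z) * a * (1 - a) * x / n
    dw2 += 2 * (a - z) * a * (1 - a) * y / n
    db  += 2 * (a - z) * a * (1 - a) / n

print(dw1, dw2, db)  # each can be cross-checked with a finite difference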
 
 
 \subsection{Two Neurons Model with 1 input}
@@ -87,7 +102,7 @@
   y &= \sigma(a^{(1)}w^{(2)} + b^{(2)})
 \end{align}

-\subsubsection{Cost}
+\subsubsection{Feed-Forward}

 \begin{align}
   a_i^{(1)} &= \sigma(x_iw^{(1)} + b^{(1)}) \\
@@ -96,7 +111,12 @@
   a_i^{(2)} &= \sigma(a_i^{(1)}w^{(2)} + b^{(2)}) \\
   \pd[w^{(2)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)})a_i^{(1)} \\
   \pd[b^{(2)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)}) \\
-  \pd[a_i^{(1)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)})w^{(2)} \\
+  \pd[a_i^{(1)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)})w^{(2)}
+\end{align}
+
+\subsubsection{Back-Propagation}
+
+\begin{align}
   C^{(2)} &= \avgsum[i, n] (a_i^{(2)} - y_i)^2 \\
   \pd[w^{(2)}] C^{(2)}
             &= \avgsum[i, n] \pd[w^{(2)}]((a_i^{(2)} - y_i)^2) = \\
@@ -111,7 +131,7 @@
             &= \avgsum[i, n] \pd[w^{1}]\left((a_1^{(i)} - e_i)^2\right) =\\
             &= \avgsum[i, n] 2(a_1^{(i)} - e_i)\pd[w^{1}]a_1^{(i)} =\\
             &= \avgsum[i, n] 2(\pd[a_i^{(1)}]C^{(2)})x_i \\
-  \pd[b^{1}]C^{(1)} &= \avgsum[i, n] 2(\pd[a_i^{(1)}]C^{(2)}) \\
+  \pd[b^{1}]C^{(1)} &= \avgsum[i, n] 2(\pd[a_i^{(1)}]C^{(2)})
 \end{align}
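
A sketch of the same gradients in Python via the plain chain rule (toy data and starting parameters are hypothetical): the second layer's gradients come first, then the error is pushed through $w^{(2)}$ back to $a^{(1)}$ to obtain the first layer's gradients.

import math

def sigma(t):
    return 1.0 / (1.0 + math.exp(-t))

xs, ys = [0.0, 0.5, 1.0], [1.0, 0.5, 0.0]   # toy samples
w1, b1, w2, b2 = 0.3, 0.0, -0.4, 0.1
n = len(xs)

dw1 = db1 = dw2 = db2 = 0.0
for x, y in zip(xs, ys):
    a1 = sigma(x * w1 + b1)            # feed-forward
    a2 = sigma(a1 * w2 + b2)
    e = 2 * (a2 - y) * a2 * (1 - a2)   # error at the output neuron
    dw2 += e * a1 / n
    db2 += e / n
    da1 = e * w2                       # error pushed back to a1
    dw1 += da1 * a1 * (1 - a1) * x / n
    db1 += da1 * a1 * (1 - a1) / n

print(dw1, db1, dw2, db2)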
 
 
 \subsection{Arbitrary Neurons Model with 1 input}
@@ -126,7 +146,7 @@ Let's assume that $a_i^{(0)}$ is $x_i$.
   a_i^{(l)} &= \sigma(a_i^{(l-1)}w^{(l)} + b^{(l)}) \\
   \pd[w^{(l)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)})a_i^{(l-1)} \\
   \pd[b^{(l)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)}) \\
-  \pd[a_i^{(l-1)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)})w^{(l)} \\
+  \pd[a_i^{(l-1)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)})w^{(l)}
 \end{align}

 \subsubsection{Back-Propagation}
@@ -137,7 +157,7 @@ Let's denote $a_i^{(m)} - y_i$ as $\pd[a_i^{(m)}]C^{(m+1)}$.
   C^{(l)} &= \avgsum[i, n] (\pd[a_i^{(l)}]C^{(l+1)})^2 \\
   \pd[w^{(l)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)})a_i^{(l-1)} =\\
   \pd[b^{(l)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)}) \\
-  \pd[a_i^{(l-1)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)})w^{(l)} \\
+  \pd[a_i^{(l-1)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)})w^{(l)}
 \end{align}
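
The recurrence generalizes to any number of chained neurons. A minimal Python sketch (layer parameters and data are made up) of the same idea via the plain chain rule: run the feed-forward pass, then walk the layers backwards, producing the gradients of $w^{(l)}$ and $b^{(l)}$ and the error passed down to $a^{(l-1)}$ at each step.

import math

def sigma(t):
    return 1.0 / (1.0 + math.exp(-t))

layers = [(0.3, 0.0), (-0.4, 0.1), (0.7, -0.2)]   # hypothetical (w, b) per layer
xs, ys = [0.0, 0.5, 1.0], [1.0, 0.5, 0.0]         # toy samples
n = len(xs)

grads = [[0.0, 0.0] for _ in layers]              # [dC/dw, dC/db] per layer
for x, y in zip(xs, ys):
    acts = [x]                                    # a^(0) = x
    for w, b in layers:                           # feed-forward
        acts.append(sigma(acts[-1] * w + b))
    d = 2 * (acts[-1] - y)                        # dC/da at the last layer
    for l in range(len(layers) - 1, -1, -1):      # back-propagation
        w, _ = layers[l]
        s = acts[l + 1] * (1 - acts[l + 1])       # sigma'(...) at layer l
        grads[l][0] += d * s * acts[l] / n
        grads[l][1] += d * s / n
        d = d * s * w                             # dC/da one layer down

print(grads)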
 
 
 \end{document}