\documentclass{article}
\usepackage{amsmath}
\usepackage{tikz}
\begin{document}
\section{Gradient Descent}
If we keep decreasing $\epsilon$ in our Finite Difference approach, in the limit we get the derivative of the cost function:
\begin{align}
C'(w) = \lim_{\epsilon \to 0}\frac{C(w + \epsilon) - C(w)}{\epsilon}
\end{align}
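As a sanity check, the two should agree for small $\epsilon$. Here is a minimal Python sketch (the toy cost and all names are ours, purely for illustration) comparing the finite-difference approximation against the analytic derivative of $C(w) = (3w - 6)^2$:
\begin{verbatim}
# Toy cost C(w) = (3w - 6)^2, with analytic derivative
# C'(w) = 2*(3w - 6)*3.
def cost(w):
    return (3*w - 6)**2

def finite_diff(c, w, eps):
    # The approximation from the Finite Difference approach.
    return (c(w + eps) - c(w)) / eps

w = 1.0
print(2*(3*w - 6)*3)                  # analytic: -18.0
for eps in (1e-1, 1e-3, 1e-6):
    print(finite_diff(cost, w, eps))  # approaches -18.0
\end{verbatim}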
Let's compute the derivatives of all our models. Throughout this paper $n$ denotes the number of samples in the training set.
\subsection{Linear Model}
\def\d{2.0}
\begin{center}
\begin{tikzpicture}
\node (X) at ({-\d*0.75}, 0) {$x$};
\node[shape=circle,draw=black] (N) at (0, 0) {$w$};
\node (Y) at ({\d*0.75}, 0) {$y$};
\path[->] (X) edge (N);
\path[->] (N) edge (Y);
\end{tikzpicture}
\end{center}
\begin{align}
y &= x \cdot w
\end{align}
\subsubsection{Cost}
\begin{align}
C(w) &= \frac{1}{n}\sum_{i=1}^{n}(x_iw - y_i)^2 \\
C'(w)
&= \left(\frac{1}{n}\sum_{i=1}^{n}(x_iw - y_i)^2\right)' \\
&= \frac{1}{n}\left(\sum_{i=1}^{n}(x_iw - y_i)^2\right)' \\
&= \frac{1}{n}\sum_{i=1}^{n}\left((x_iw - y_i)^2\right)' \\
&= \frac{1}{n}\sum_{i=1}^{n}2(x_iw - y_i)x_i
\end{align}
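To make the formula concrete, here is a minimal Python sketch of gradient descent on this model; the data, the learning rate, and the iteration count are made-up illustrative choices:
\begin{verbatim}
xs = [1.0, 2.0, 3.0, 4.0]  # training inputs
ys = [2.0, 4.0, 6.0, 8.0]  # training targets (generated with w = 2)
n = len(xs)

w = 0.0       # initial parameter
rate = 0.01   # learning rate
for _ in range(1000):
    # C'(w) = (1/n) * sum of 2*(x_i*w - y_i)*x_i
    dw = sum(2*(x*w - y)*x for x, y in zip(xs, ys)) / n
    w -= rate * dw
print(w)  # converges towards 2.0
\end{verbatim}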
\subsection{One-Neuron Model with Two Inputs}
\begin{center}
\begin{tikzpicture}
\node (X) at (-\d, 1) {$x$};
\node (Y) at (-\d, -1) {$y$};
\node[shape=circle,draw=black] (N) at (0, 0) {$\sigma, b$};
\node (Z) at (\d, 0) {$z$};
\path[->] (X) edge node[above] {$w_1$} (N);
\path[->] (Y) edge node[above] {$w_2$} (N);
\path[->] (N) edge (Z);
\end{tikzpicture}
\end{center}
\begin{align}
z &= \sigma(xw_1 + yw_2 + b) \\
\sigma(x) &= \frac{1}{1 + e^{-x}} \\
\sigma'(x) &= \sigma(x)(1 - \sigma(x))
\end{align}
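For completeness, the derivative formula follows from the chain rule and the observation that $1 - \sigma(x) = \frac{e^{-x}}{1 + e^{-x}}$:
\begin{align}
\sigma'(x)
&= \left(\frac{1}{1 + e^{-x}}\right)'
= \frac{e^{-x}}{(1 + e^{-x})^2} \\
&= \frac{1}{1 + e^{-x}} \cdot \frac{e^{-x}}{1 + e^{-x}}
= \sigma(x)(1 - \sigma(x))
\end{align}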
\subsubsection{Cost}
\def\pd[#1]{\partial_{#1}}
\def\avgsum[#1,#2]{\frac{1}{#2}\sum_{#1=1}^{#2}}
\begin{align}
a_i &= \sigma(x_iw_1 + y_iw_2 + b) \\
\pd[w_1]a_i
&= \pd[w_1](\sigma(x_iw_1 + y_iw_2 + b)) \\
&= a_i(1 - a_i)\pd[w_1](x_iw_1 + y_iw_2 + b) \\
&= a_i(1 - a_i)x_i \\
\pd[w_2]a_i &= a_i(1 - a_i)y_i \\
\pd[b]a_i &= a_i(1 - a_i) \\
C &= \avgsum[i, n](a_i - z_i)^2 \\
\pd[w_1] C
&= \avgsum[i, n]\pd[w_1]\left((a_i - z_i)^2\right) \\
&= \avgsum[i, n]2(a_i - z_i)\pd[w_1]a_i \\
&= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i)x_i \\
\pd[w_2] C &= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i)y_i \\
\pd[b] C &= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i)
\end{align}
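As a sketch of how these derivatives drive training, here is a minimal Python loop fitting the neuron to the OR gate; the data set, the initial values, and the learning rate are hypothetical choices of ours:
\begin{verbatim}
import math

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

# (x, y) inputs with target z: the OR gate
samples = [(0, 0, 0), (0, 1, 1), (1, 0, 1), (1, 1, 1)]
n = len(samples)

w1, w2, b = 0.5, 0.5, 0.0
rate = 1.0
for _ in range(10000):
    dw1 = dw2 = db = 0.0
    for x, y, z in samples:
        a = sigmoid(x*w1 + y*w2 + b)
        g = 2*(a - z)*a*(1 - a)  # factor shared by all three partials
        dw1 += g*x
        dw2 += g*y
        db  += g
    w1 -= rate*dw1/n
    w2 -= rate*dw2/n
    b  -= rate*db/n
\end{verbatim}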
\subsection{Two-Neuron Model with One Input}
\begin{center}
\begin{tikzpicture}
\node (X) at (-\d, 0) {$x$};
\node[shape=circle,draw=black] (N1) at (0, 0) {$\sigma, b^{(1)}$};
\node[shape=circle,draw=black] (N2) at (\d, 0) {$\sigma, b^{(2)}$};
\node (Y) at ({2*\d}, 0) {$y$};
\path[->] (X) edge node[above] {$w^{(1)}$} (N1);
\path[->] (N1) edge node[above] {$w^{(2)}$} (N2);
\path[->] (N2) edge (Y);
\end{tikzpicture}
\end{center}
\begin{align}
a^{(1)} &= \sigma(xw^{(1)} + b^{(1)}) \\
y &= \sigma(a^{(1)}w^{(2)} + b^{(2)})
\end{align}
The superscript in parentheses denotes the layer. For example, $a_i^{(l)}$ denotes the activation of the $l$-th layer on the $i$-th sample.
\subsubsection{Feed-Forward}
\begin{align}
a_i^{(1)} &= \sigma(x_iw^{(1)} + b^{(1)}) \\
\pd[w^{(1)}]a_i^{(1)} &= a_i^{(1)}(1 - a_i^{(1)})x_i \\
\pd[b^{(1)}]a_i^{(1)} &= a_i^{(1)}(1 - a_i^{(1)}) \\
a_i^{(2)} &= \sigma(a_i^{(1)}w^{(2)} + b^{(2)}) \\
\pd[w^{(2)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)})a_i^{(1)} \\
\pd[b^{(2)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)}) \\
\pd[a_i^{(1)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)})w^{(2)}
\end{align}
\subsubsection{Back-Propagation}
\begin{align}
C^{(2)} &= \avgsum[i, n] (a_i^{(2)} - y_i)^2 \\
\pd[w^{(2)}] C^{(2)}
&= \avgsum[i, n] \pd[w^{(2)}]((a_i^{(2)} - y_i)^2) \\
&= \avgsum[i, n] 2(a_i^{(2)} - y_i)\pd[w^{(2)}]a_i^{(2)} \\
&= \avgsum[i, n] 2(a_i^{(2)} - y_i)a_i^{(2)}(1 - a_i^{(2)})a_i^{(1)} \\
\pd[b^{(2)}] C^{(2)} &= \avgsum[i, n] 2(a_i^{(2)} - y_i)a_i^{(2)}(1 - a_i^{(2)}) \\
\pd[a_i^{(1)}]C^{(2)} &= \avgsum[i, n] 2(a_i^{(2)} - y_i)a_i^{(2)}(1 - a_i^{(2)})w^{(2)} \\
e_i &= a_i^{(1)} - \pd[a_i^{(1)}]C^{(2)} \\
C^{(1)} &= \avgsum[i, n] (a_i^{(1)} - e_i)^2 \\
\pd[w^{(1)}]C^{(1)}
&= \pd[w^{(1)}]\left(\avgsum[i, n] (a_i^{(1)} - e_i)^2\right) \\
&= \avgsum[i, n] \pd[w^{(1)}]\left((a_i^{(1)} - e_i)^2\right) \\
&= \avgsum[i, n] 2(a_i^{(1)} - e_i)\pd[w^{(1)}]a_i^{(1)} \\
&= \avgsum[i, n] 2(\pd[a_i^{(1)}]C^{(2)})a_i^{(1)}(1 - a_i^{(1)})x_i \\
\pd[b^{(1)}]C^{(1)} &= \avgsum[i, n] 2(\pd[a_i^{(1)}]C^{(2)})a_i^{(1)}(1 - a_i^{(1)})
\end{align}
Here $e_i$ acts as a pseudo-target for the first layer: by the definition of $e_i$ we have $a_i^{(1)} - e_i = \pd[a_i^{(1)}]C^{(2)}$, which justifies the substitution in the last step of the derivation.
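The formulas above translate into a short Python sketch of a single gradient step; the training pairs, the initial parameters, and the learning rate are made-up illustrative values:
\begin{verbatim}
import math

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

samples = [(0.0, 1.0), (1.0, 0.0)]  # (x_i, y_i) training pairs
n = len(samples)
w1, b1, w2, b2 = 0.5, 0.0, 0.5, 0.0
rate = 0.1

dw1 = db1 = dw2 = db2 = 0.0
for x, y in samples:
    a1 = sigmoid(x*w1 + b1)      # feed-forward
    a2 = sigmoid(a1*w2 + b2)
    g2 = 2*(a2 - y)*a2*(1 - a2)  # shared factor of the layer-2 partials
    dw2 += g2*a1
    db2 += g2
    da1 = g2*w2                  # the error pushed back to layer 1
    g1 = 2*da1*a1*(1 - a1)       # shared factor of the layer-1 partials
    dw1 += g1*x
    db1 += g1
w1 -= rate*dw1/n; b1 -= rate*db1/n
w2 -= rate*dw2/n; b2 -= rate*db2/n
\end{verbatim}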
\subsection{Arbitrary-Depth Model with One Input}
Let's assume that we have $m$ layers.
\subsubsection{Feed-Forward}
Let's define $a_i^{(0)} = x_i$.
\begin{align}
a_i^{(l)} &= \sigma(a_i^{(l-1)}w^{(l)} + b^{(l)}) \\
\pd[w^{(l)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)})a_i^{(l-1)} \\
\pd[b^{(l)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)}) \\
\pd[a_i^{(l-1)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)})w^{(l)}
\end{align}
\subsubsection{Back-Propagation}
Let's denote $a_i^{(m)} - y_i$ as $\pd[a_i^{(m)}]C^{(m+1)}$.
\begin{align}
C^{(l)} &= \avgsum[i, n] (\pd[a_i^{(l)}]C^{(l+1)})^2 \\
\pd[w^{(l)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)})a_i^{(l-1)} \\
\pd[b^{(l)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)}) \\
\pd[a_i^{(l-1)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)})w^{(l)}
\end{align}
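Finally, a minimal Python sketch of one gradient step for the general $m$-layer chain; the parameters, the data, and the learning rate are hypothetical:
\begin{verbatim}
import math

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

ws = [0.5, 0.5, 0.5]  # w^{(1)} .. w^{(m)}
bs = [0.0, 0.0, 0.0]  # b^{(1)} .. b^{(m)}
m = len(ws)
samples = [(0.0, 1.0), (1.0, 0.0)]  # (x_i, y_i) pairs
n = len(samples)
rate = 0.1

dws = [0.0]*m
dbs = [0.0]*m
for x, y in samples:
    # feed-forward: a[l] holds a^{(l)}, with a[0] = x
    a = [x]
    for l in range(m):
        a.append(sigmoid(a[l]*ws[l] + bs[l]))
    # back-propagation: da plays the role of d_{a^{(l)}} C^{(l+1)}
    da = a[m] - y
    for l in reversed(range(m)):
        g = 2*da*a[l+1]*(1 - a[l+1])
        dws[l] += g*a[l]
        dbs[l] += g
        da = g*ws[l]  # becomes d_{a^{(l-1)}} C^{(l)}
for l in range(m):
    ws[l] -= rate*dws[l]/n
    bs[l] -= rate*dbs[l]/n
\end{verbatim}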
\end{document}