rexim committed 2 years ago
parent commit 8fe7f5a530
5 files changed, 223 insertions(+), 25 deletions(-)
  1. .gitignore       (+3 −1)
  2. gates.c          (+45 −8)
  3. papers/grad.pdf  (BIN)
  4. papers/grad.tex  (+143 −0)
  5. twice.c          (+32 −16)

+ 3 - 1
.gitignore

@@ -1 +1,3 @@
-build/
+build/
+*.aux
+*.log

+ 45 - 8
gates.c

@@ -39,7 +39,7 @@ sample xor_train[] = {
     {1, 1, 0},
 };
 
-sample *train = xor_train;
+sample *train = and_train;
 size_t train_count = 4;
 
 float cost(float w1, float w2, float b)
@@ -56,6 +56,39 @@ float cost(float w1, float w2, float b)
     return result;
 }
 
+void dcost(float eps,
+           float w1, float w2, float b,
+           float *dw1, float *dw2, float *db)
+{
+    float c = cost(w1, w2, b);
+    *dw1 = (cost(w1 + eps, w2, b) - c)/eps;
+    *dw2 = (cost(w1, w2 + eps, b) - c)/eps;
+    *db  = (cost(w1, w2, b + eps) - c)/eps;
+}
+
+
+void gcost(float w1, float w2, float b,
+           float *dw1, float *dw2, float *db)
+{
+    *dw1 = 0;
+    *dw2 = 0;
+    *db = 0;
+    size_t n = train_count;
+    for (size_t i = 0; i < n; ++i) {
+        float xi = train[i][0];
+        float yi = train[i][1];
+        float zi = train[i][2];
+        float ai = sigmoidf(xi*w1 + yi*w2 + b);
+        float di = 2*(ai - zi)*ai*(1 - ai);
+        *dw1 += di*xi;
+        *dw2 += di*yi;
+        *db  += di;
+    }
+    *dw1 /= n;
+    *dw2 /= n;
+    *db /= n;
+}
+
 float rand_float(void)
 {
     return (float) rand()/ (float) RAND_MAX;
@@ -79,20 +112,24 @@ int main(void)
     float w2 = rand_float();
     float b  = rand_float();
 
-    float eps = 1e-1;
     float rate = 1e-1;
 
-    for (size_t i = 0; i < 500*1000; ++i) {
+    for (size_t i = 0; i < 10*1000; ++i) {
         float c = cost(w1, w2, b);
-        printf("w1 = %f, w2 = %f, b = %f, c = %f\n", w1, w2, b, c);
-        float dw1 = (cost(w1 + eps, w2, b) - c)/eps;
-        float dw2 = (cost(w1, w2 + eps, b) - c)/eps;
-        float db  = (cost(w1, w2, b + eps) - c)/eps;
+        printf("c = %f, w1 = %f, w2 = %f, b = %f\n", c, w1, w2, b);
+
+        float dw1, dw2, db;
+#if 1
+        float eps = 1e-1;
+        dcost(eps, w1, w2, b, &dw1, &dw2, &db);
+#else
+        gcost(w1, w2, b, &dw1, &dw2, &db);
+#endif
         w1 -= rate*dw1;
         w2 -= rate*dw2;
         b  -= rate*db;
     }
-    printf("w1 = %f, w2 = %f, b = %f, c = %f\n", w1, w2, b, cost(w1, w2, b));
+    printf("c = %f, w1 = %f, w2 = %f, b = %f\n", cost(w1, w2, b), w1, w2, b);
 
     for (size_t i = 0; i < 2; ++i) {
         for (size_t j = 0; j < 2; ++j) {

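As a sanity check on the two new routines, dcost's finite differences and gcost's analytic derivatives should agree closely for small eps. Here is a minimal standalone sketch of that comparison (not part of the commit); the sample type, sigmoidf, and the AND truth table in and_train are assumed to mirror definitions that sit above the hunks shown here.

// Minimal gradient check: dcost-style finite differences vs
// gcost-style analytic derivative, dw1 component only.
// Assumptions (not from the commit): sample, sigmoidf, and_train.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef float sample[3];

sample and_train[] = {
    {0, 0, 0},
    {1, 0, 0},
    {0, 1, 0},
    {1, 1, 1},
};
#define TRAIN_COUNT 4

float sigmoidf(float x)
{
    return 1.0f/(1.0f + expf(-x));
}

float cost(float w1, float w2, float b)
{
    float result = 0.0f;
    for (size_t i = 0; i < TRAIN_COUNT; ++i) {
        float ai = sigmoidf(and_train[i][0]*w1 + and_train[i][1]*w2 + b);
        float d = ai - and_train[i][2];
        result += d*d;
    }
    return result/TRAIN_COUNT;
}

int main(void)
{
    srand(69);
    float w1 = (float) rand()/(float) RAND_MAX;
    float w2 = (float) rand()/(float) RAND_MAX;
    float b  = (float) rand()/(float) RAND_MAX;

    // Finite difference, as in dcost
    float eps = 1e-3f;
    float c = cost(w1, w2, b);
    float fdw1 = (cost(w1 + eps, w2, b) - c)/eps;

    // Analytic derivative, as in gcost
    float gdw1 = 0.0f;
    for (size_t i = 0; i < TRAIN_COUNT; ++i) {
        float ai = sigmoidf(and_train[i][0]*w1 + and_train[i][1]*w2 + b);
        gdw1 += 2*(ai - and_train[i][2])*ai*(1 - ai)*and_train[i][0];
    }
    gdw1 /= TRAIN_COUNT;

    printf("dw1: finite = %f, analytic = %f\n", fdw1, gdw1);
    return 0;
}

Compiled with something like cc check.c -lm (the file name is hypothetical), the two printed numbers should match to a few decimal places.
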
BIN
papers/grad.pdf


+ 143 - 0
papers/grad.tex

@@ -0,0 +1,143 @@
+\documentclass{article}
+
+\usepackage{amsmath}
+\usepackage{tikz}
+
+\begin{document}
+\section{Gradient Descent}
+
+\begin{align}
+  C'(w) = \lim_{\epsilon \to 0}\frac{C(w + \epsilon) - C(w)}{\epsilon}
+\end{align}
+
+\subsection{``Twice''}
+
+\begin{align}
+  C(w) &= \frac{1}{n}\sum_{i=1}^{n}(x_iw - y_i)^2 \\
+  C'(w)
+       &= \left(\frac{1}{n}\sum_{i=1}^{n}(x_iw - y_i)^2\right)' = \\
+       &= \frac{1}{n}\left(\sum_{i=1}^{n}(x_iw - y_i)^2\right)' \\
+       &= \frac{1}{n}\sum_{i=1}^{n}\left((x_iw - y_i)^2\right)' \\
+       &= \frac{1}{n}\sum_{i=1}^{n}2(x_iw - y_i)x_i \\
+\end{align}
+
+\begin{align}
+  C(w) &= \frac{1}{n}\sum_{i=1}^{n}(x_iw - y_i)^2 \\  
+  C'(w) &= \frac{1}{n}\sum_{i=1}^{n}2(x_iw - y_i)x_i \\
+\end{align}
+
+\subsection{One Neuron Model with 2 inputs}
+
+\def\d{2.0}
+
+\begin{center}
+  \begin{tikzpicture}
+    \node (X) at (-\d, 1) {$x$};
+    \node (Y) at (-\d, -1) {$y$};
+    \node[shape=circle,draw=black] (N) at (0, 0) {$\sigma, b$};
+    \node (Z) at (\d, 0) {$z$};
+    \path[->] (X) edge node[above] {$w_1$} (N);
+    \path[->] (Y) edge node[above] {$w_2$} (N);
+    \path[->] (N) edge (Z);
+  \end{tikzpicture}
+\end{center}
+\begin{align}
+  z &= \sigma(xw_1 + yw_2 + b) \\
+  \sigma(x) &= \frac{1}{1 + e^{-x}} \\
+  \sigma'(x) &= \sigma(x)(1 - \sigma(x)) \\
+\end{align}
+
+\subsubsection{Cost}
+
+\def\pd[#1]{\partial_{#1}}
+\def\avgsum[#1,#2]{\frac{1}{#2}\sum_{#1=1}^{#2}}
+\begin{align}
+  a_i &= \sigma(x_iw_1 + y_iw_2 + b) \\
+  \pd[w_1]a_i
+      &= \pd[w_1](\sigma(x_iw_1 + y_iw_2 + b)) = \\
+      &= a_i(1 - a_i)\pd[w_1](x_iw_1 + y_iw_2 + b) = \\
+      &= a_i(1 - a_i)x_i \\
+  \pd[w_2]a_i &= a_i(1 - a_i)y_i \\
+  \pd[b]a_i &= a_i(1 - a_i) \\
+  C &= \avgsum[i, n](a_i - z_i)^2 \\
+  \pd[w_1] C
+      &= \avgsum[i, n]\pd[w_1]\left((a_i - z_i)^2\right) = \\
+      &= \avgsum[i, n]2(a_i - z_i)\pd[w_1]a_i = \\
+      &= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i)x_i \\
+  \pd[w_2] C &= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i)y_i \\
+  \pd[b] C &= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i) \\
+\end{align}
+
+\subsection{Two Neurons Model with 1 input}
+
+\begin{center}
+  \begin{tikzpicture}
+    \node (X) at (-\d, 0) {$x$};
+    \node[shape=circle,draw=black] (N1) at (0, 0) {$\sigma, b^{(1)}$};
+    \node[shape=circle,draw=black] (N2) at (\d, 0) {$\sigma, b^{(2)}$};
+    \node (Y) at ({2*\d}, 0) {$y$};
+    \path[->] (X) edge node[above] {$w^{(1)}$} (N1);
+    \path[->] (N1) edge node[above] {$w^{(2)}$} (N2);
+    \path[->] (N2) edge (Y);
+  \end{tikzpicture}
+\end{center}
+
+\begin{align}
+  a^{(1)} &= \sigma(xw^{(1)} + b^{(1)}) \\
+  y &= \sigma(a^{(1)}w^{(2)} + b^{(2)})
+\end{align}
+
+\subsubsection{Cost}
+
+\begin{align}
+  a_i^{(1)} &= \sigma(x_iw^{(1)} + b^{(1)}) \\
+  \pd[w^{(1)}]a_i^{(1)} &= a_i^{(1)}(1 - a_i^{(1)})x_i \\
+  \pd[b^{(1)}]a_i^{(1)} &= a_i^{(1)}(1 - a_i^{(1)}) \\
+  a_i^{(2)} &= \sigma(a_i^{(1)}w^{(2)} + b^{(2)}) \\
+  \pd[w^{(2)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)})a_i^{(1)} \\
+  \pd[b^{(2)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)}) \\
+  \pd[a_i^{(1)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)})w^{(2)} \\
+  C^{(2)} &= \avgsum[i, n] (a_i^{(2)} - y_i)^2 \\
+  \pd[w^{(2)}] C^{(2)}
+            &= \avgsum[i, n] \pd[w^{(2)}]((a_i^{(2)} - y_i)^2) = \\
+            &= \avgsum[i, n] 2(a_i^{(2)} - y_i)\pd[w^{(2)}]a_i^{(2)} = \\
+            &= \avgsum[i, n] 2(a_i^{(2)} - y_i)a_i^{(2)}(1 - a_i^{(2)})a_i^{(1)} \\
+  \pd[b^{(2)}] C^{(2)} &= \avgsum[i, n] 2(a_i^{(2)} - y_i)a_i^{(2)}(1 - a_i^{(2)}) \\
+  \pd[a_i^{(1)}]C^{(2)} &= \avgsum[i, n] 2(a_i^{(2)} - y_i)a_i^{(2)}(1 - a_i^{(2)})w^{(2)} \\
+  e_i &= a_i^{(1)} - \pd[a_i^{(1)}]C^{(2)} \\
+  C^{(1)} &= \avgsum[i, n] (a_i^{(1)} - e_i)^2 \\
+  \pd[w^{(1)}]C^{(1)}
+            &= \pd[w^{(1)}]\left(\avgsum[i, n] (a_i^{(1)} - e_i)^2\right) =\\
+            &= \avgsum[i, n] \pd[w^{(1)}]\left((a_i^{(1)} - e_i)^2\right) =\\
+            &= \avgsum[i, n] 2(a_i^{(1)} - e_i)\pd[w^{(1)}]a_i^{(1)} =\\
+            &= \avgsum[i, n] 2(\pd[a_i^{(1)}]C^{(2)})a_i^{(1)}(1 - a_i^{(1)})x_i \\
+  \pd[b^{(1)}]C^{(1)} &= \avgsum[i, n] 2(\pd[a_i^{(1)}]C^{(2)})a_i^{(1)}(1 - a_i^{(1)}) \\
+\end{align}
+
+\subsection{Arbitrary Neurons Model with 1 input}
+
+Let's assume that we have $m$ layers.
+
+\subsubsection{Feed-Forward}
+
+Let's assume that $a_i^{(0)}$ is $x_i$.
+
+\begin{align}
+  a_i^{(l)} &= \sigma(a_i^{(l-1)}w^{(l)} + b^{(l)}) \\
+  \pd[w^{(l)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)})a_i^{(l-1)} \\
+  \pd[b^{(l)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)}) \\
+  \pd[a_i^{(l-1)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)})w^{(l)} \\
+\end{align}
+
+\subsubsection{Back-Propagation}
+
+Let's define the base case $\pd[a_i^{(m)}]C^{(m+1)}$ to be $a_i^{(m)} - y_i$.
+
+\begin{align}
+  C^{(l)} &= \avgsum[i, n] (\pd[a_i^{(l)}]C^{(l+1)})^2 \\
+  \pd[w^{(l)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)})a_i^{(l-1)} \\
+  \pd[b^{(l)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)}) \\
+  \pd[a_i^{(l-1)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)})w^{(l)} \\
+\end{align}
+
+\end{document}

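The Back-Propagation subsection above translates almost line-for-line into C. Below is a minimal sketch (not part of the commit) of the m-layer, single-input chain: feed-forward per the $a_i^{(l)}$ formula, then one backward sweep that carries the partial with respect to $a^{(l)}$ down the layers. The layer count, training data, learning rate, and iteration count are illustrative assumptions, and the factor of 2 is applied once at the output rather than reintroduced per intermediate cost.

// Chain of M sigmoid neurons with one input, trained with the
// back-propagation recurrence from papers/grad.tex.
// Assumptions (not from the commit): M, the data, rate, iterations.
#include <stdio.h>
#include <math.h>

#define M 3   // number of layers, the paper's m
#define N 4   // number of samples, the paper's n

float sigmoidf(float x)
{
    return 1.0f/(1.0f + expf(-x));
}

int main(void)
{
    // Hypothetical data squashed into (0, 1) so the sigmoid can fit it
    float xs[N] = {0.0f, 0.1f, 0.2f, 0.3f};
    float ys[N] = {0.0f, 0.2f, 0.4f, 0.6f};

    float w[M + 1], b[M + 1];  // 1-indexed to match w^{(l)}, b^{(l)}
    for (int l = 1; l <= M; ++l) { w[l] = 0.5f; b[l] = 0.0f; }

    float rate = 1e-1f;
    for (int iter = 0; iter < 100*1000; ++iter) {
        float dw[M + 1] = {0}, db[M + 1] = {0};
        for (int i = 0; i < N; ++i) {
            // Feed-forward: a^{(0)} = x_i, a^{(l)} = sigma(a^{(l-1)}w^{(l)} + b^{(l)})
            float a[M + 1];
            a[0] = xs[i];
            for (int l = 1; l <= M; ++l) {
                a[l] = sigmoidf(a[l-1]*w[l] + b[l]);
            }
            // Backward sweep, seeded by the output error a^{(m)} - y_i
            float da = 2*(a[M] - ys[i]);
            for (int l = M; l >= 1; --l) {
                float s = da*a[l]*(1 - a[l]);  // da times sigma'
                dw[l] += s*a[l-1];             // pd[w^{(l)}] contribution
                db[l] += s;                    // pd[b^{(l)}] contribution
                da = s*w[l];                   // pd[a^{(l-1)}]a^{(l)} carries it down
            }
        }
        for (int l = 1; l <= M; ++l) {
            w[l] -= rate*dw[l]/N;  // average over the n samples, then step
            b[l] -= rate*db[l]/N;
        }
    }

    for (int i = 0; i < N; ++i) {
        float a = xs[i];
        for (int l = 1; l <= M; ++l) a = sigmoidf(a*w[l] + b[l]);
        printf("x = %f: model = %f, expected = %f\n", xs[i], a, ys[i]);
    }
    return 0;
}

The backward sweep reuses $\pd[a_i^{(l-1)}]a_i^{(l)} = a_i^{(l)}(1 - a_i^{(l)})w^{(l)}$ from the Feed-Forward subsection, so each layer's gradient costs one multiply beyond the feed-forward pass.
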
+ 32 - 16
twice.c

@@ -20,40 +20,56 @@ float rand_float(void)
 // w1, w2, w3, ...
 // y = x1*w1 + x2*w2 + x3*w3 + ... + b
 
-float cost(float w, float b)
+float cost(float w)
 {
     float result = 0.0f;
-    for (size_t i = 0; i < train_count; ++i) {
+    size_t n = train_count;
+    for (size_t i = 0; i < n; ++i) {
         float x = train[i][0];
-        float y = x*w + b;
+        float y = x*w;
         float d = y - train[i][1];
         result += d*d;
     }
-    result /= train_count;
+    result /= n;
+    return result;
+}
+
+float dcost(float w)
+{
+    float result = 0.0f;
+    size_t n = train_count;
+    for (size_t i = 0; i < n; ++i) {
+        float x = train[i][0];
+        float y = train[i][1];
+        result += 2*(x*w - y)*x;
+    }
+    result /= n;
     return result;
 }
 
 int main()
 {
-    srand(time(0));
+    // srand(time(0));
+    srand(69);
     float w = rand_float()*10.0f;
-    float b = rand_float()*5.0f;
 
-    float eps = 1e-3;
-    float rate = 1e-3;
+    float rate = 1e-1;
 
-    printf("%f\n", cost(w, b));
-    for (size_t i = 0; i < 500; ++i) {
-        float c = cost(w, b);
-        float dw = (cost(w + eps, b) - c)/eps;
-        float db = (cost(w, b + eps) - c)/eps;
+    printf("cost = %f, w = %f\n", cost(w), w);
+    for (size_t i = 0; i < 50; ++i) {
+#if 0
+        float eps = 1e-3;
+        float c = cost(w);
+        float dw = (cost(w + eps) - c)/eps;
+#else
+        float dw = dcost(w);
+#endif
         w -= rate*dw;
-        b -= rate*db;
-        printf("cost = %f, w = %f, b = %f\n", cost(w, b), w, b);
+        printf("cost = %f, w = %f\n", cost(w), w);
     }
 
     printf("------------------------------\n");
-    printf("w = %f, b = %f\n", w, b);
+    printf("w = %f\n", w);
 
     return 0;
 }
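
As a quick analytic check on the new dcost: assuming twice.c's training data (defined above the hunks shown here) encodes $y_i = 2x_i$, the derivative from the ``Twice'' section of grad.tex vanishes exactly at $w = 2$:

\begin{align}
  C'(2) = \frac{1}{n}\sum_{i=1}^{n}2(2x_i - y_i)x_i
        = \frac{1}{n}\sum_{i=1}^{n}2(2x_i - 2x_i)x_i = 0
\end{align}

so $w = 2$ is a stationary point of the update $w \leftarrow w - \mathrm{rate} \cdot C'(w)$, which is what the 50-iteration loop settles toward.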