rexim committed 2 years ago
parent commit 8fe7f5a530
5 files changed, 223 insertions(+), 25 deletions(-)
  1. .gitignore       (+3 −1)
  2. gates.c          (+45 −8)
  3. papers/grad.pdf  (BIN)
  4. papers/grad.tex  (+143 −0)
  5. twice.c          (+32 −16)

+ 3 - 1
.gitignore

@@ -1 +1,3 @@
-build/
+build/
+*.aux
+*.log

+ 45 - 8
gates.c

@@ -39,7 +39,7 @@ sample xor_train[] = {
     {1, 1, 0},
 };
 
-sample *train = xor_train;
+sample *train = and_train;
 size_t train_count = 4;
 
 float cost(float w1, float w2, float b)
@@ -56,6 +56,39 @@ float cost(float w1, float w2, float b)
     return result;
 }
 
+void dcost(float eps,
+           float w1, float w2, float b,
+           float *dw1, float *dw2, float *db)
+{
+    float c = cost(w1, w2, b);
+    *dw1 = (cost(w1 + eps, w2, b) - c)/eps;
+    *dw2 = (cost(w1, w2 + eps, b) - c)/eps;
+    *db  = (cost(w1, w2, b + eps) - c)/eps;
+}
+
+
+void gcost(float w1, float w2, float b,
+           float *dw1, float *dw2, float *db)
+{
+    *dw1 = 0;
+    *dw2 = 0;
+    *db = 0;
+    size_t n = train_count;
+    for (size_t i = 0; i < n; ++i) {
+        float xi = train[i][0];
+        float yi = train[i][1];
+        float zi = train[i][2];
+        float ai = sigmoidf(xi*w1 + yi*w2 + b);
+        float di = 2*(ai - zi)*ai*(1 - ai);
+        *dw1 += di*xi;
+        *dw2 += di*yi;
+        *db  += di;
+    }
+    *dw1 /= n;
+    *dw2 /= n;
+    *db /= n;
+}
+
 float rand_float(void)
 {
     return (float) rand()/ (float) RAND_MAX;
@@ -79,20 +112,24 @@ int main(void)
     float w2 = rand_float();
     float b  = rand_float();
 
-    float eps = 1e-1;
     float rate = 1e-1;
 
-    for (size_t i = 0; i < 500*1000; ++i) {
+    for (size_t i = 0; i < 10*1000; ++i) {
         float c = cost(w1, w2, b);
-        printf("w1 = %f, w2 = %f, b = %f, c = %f\n", w1, w2, b, c);
-        float dw1 = (cost(w1 + eps, w2, b) - c)/eps;
-        float dw2 = (cost(w1, w2 + eps, b) - c)/eps;
-        float db  = (cost(w1, w2, b + eps) - c)/eps;
+        printf("c = %f, w1 = %f, w2 = %f, b = %f\n", c, w1, w2, b);
+
+        float dw1, dw2, db;
+#if 1
+        float eps = 1e-1;
+        dcost(eps, w1, w2, b, &dw1, &dw2, &db);
+#else
+        gcost(w1, w2, b, &dw1, &dw2, &db);
+#endif
         w1 -= rate*dw1;
         w2 -= rate*dw2;
         b  -= rate*db;
     }
-    printf("w1 = %f, w2 = %f, b = %f, c = %f\n", w1, w2, b, cost(w1, w2, b));
+    printf("c = %f, w1 = %f, w2 = %f, b = %f\n", cost(w1, w2, b), w1, w2, b);
 
     for (size_t i = 0; i < 2; ++i) {
         for (size_t j = 0; j < 2; ++j) {

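As a sanity check on the two new routines, dcost's finite differences and gcost's analytic derivatives should agree closely for small eps. Here is a minimal standalone sketch of that comparison (not part of the commit); the sample type, sigmoidf, and the AND truth table in and_train are assumed to mirror definitions that sit above the hunks shown here.

// Minimal gradient check: dcost-style finite differences vs
// gcost-style analytic derivative, dw1 component only.
// Assumptions (not from the commit): sample, sigmoidf, and_train.
#include <stdio.h>
#include <stdlib.h>
#include <math.h>

typedef float sample[3];

sample and_train[] = {
    {0, 0, 0},
    {1, 0, 0},
    {0, 1, 0},
    {1, 1, 1},
};
#define TRAIN_COUNT 4

float sigmoidf(float x)
{
    return 1.0f/(1.0f + expf(-x));
}

float cost(float w1, float w2, float b)
{
    float result = 0.0f;
    for (size_t i = 0; i < TRAIN_COUNT; ++i) {
        float ai = sigmoidf(and_train[i][0]*w1 + and_train[i][1]*w2 + b);
        float d = ai - and_train[i][2];
        result += d*d;
    }
    return result/TRAIN_COUNT;
}

int main(void)
{
    srand(69);
    float w1 = (float) rand()/(float) RAND_MAX;
    float w2 = (float) rand()/(float) RAND_MAX;
    float b  = (float) rand()/(float) RAND_MAX;

    // Finite difference, as in dcost
    float eps = 1e-3f;
    float c = cost(w1, w2, b);
    float fdw1 = (cost(w1 + eps, w2, b) - c)/eps;

    // Analytic derivative, as in gcost
    float gdw1 = 0.0f;
    for (size_t i = 0; i < TRAIN_COUNT; ++i) {
        float ai = sigmoidf(and_train[i][0]*w1 + and_train[i][1]*w2 + b);
        gdw1 += 2*(ai - and_train[i][2])*ai*(1 - ai)*and_train[i][0];
    }
    gdw1 /= TRAIN_COUNT;

    printf("dw1: finite = %f, analytic = %f\n", fdw1, gdw1);
    return 0;
}

Compiled with something like cc check.c -lm (the file name is hypothetical), the two printed numbers should match to a few decimal places.
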
BIN
papers/grad.pdf


+ 143 - 0
papers/grad.tex

@@ -0,0 +1,143 @@
+\documentclass{article}
+
+\usepackage{amsmath}
+\usepackage{tikz}
+
+\begin{document}
+\section{Gradient Descent}
+
+\begin{align}
+  C'(w) = \lim_{\epsilon \to 0}\frac{C(w + \epsilon) - C(w)}{\epsilon}
+\end{align}
+
+\subsection{``Twice''}
+
+\begin{align}
+  C(w) &= \frac{1}{n}\sum_{i=1}^{n}(x_iw - y_i)^2 \\
+  C'(w)
+       &= \left(\frac{1}{n}\sum_{i=1}^{n}(x_iw - y_i)^2\right)' = \\
+       &= \frac{1}{n}\left(\sum_{i=1}^{n}(x_iw - y_i)^2\right)' \\
+       &= \frac{1}{n}\sum_{i=1}^{n}\left((x_iw - y_i)^2\right)' \\
+       &= \frac{1}{n}\sum_{i=1}^{n}2(x_iw - y_i)x_i \\
+\end{align}
+
+\begin{align}
+  C(w) &= \frac{1}{n}\sum_{i=1}^{n}(x_iw - y_i)^2 \\  
+  C'(w) &= \frac{1}{n}\sum_{i=1}^{n}2(x_iw - y_i)x_i \\
+\end{align}
+
+\subsection{One Neuron Model with 2 inputs}
+
+\def\d{2.0}
+
+\begin{center}
+  \begin{tikzpicture}
+    \node (X) at (-\d, 1) {$x$};
+    \node (Y) at (-\d, -1) {$y$};
+    \node[shape=circle,draw=black] (N) at (0, 0) {$\sigma, b$};
+    \node (Z) at (\d, 0) {$z$};
+    \path[->] (X) edge node[above] {$w_1$} (N);
+    \path[->] (Y) edge node[above] {$w_2$} (N);
+    \path[->] (N) edge (Z);
+  \end{tikzpicture}
+\end{center}
+\begin{align}
+  z &= \sigma(xw_1 + yw_2 + b) \\
+  \sigma(x) &= \frac{1}{1 + e^{-x}} \\
+  \sigma'(x) &= \sigma(x)(1 - \sigma(x)) \\
+\end{align}
+
+\subsubsection{Cost}
+
+\def\pd[#1]{\partial_{#1}}
+\def\avgsum[#1,#2]{\frac{1}{#2}\sum_{#1=1}^{#2}}
+\begin{align}
+  a_i &= \sigma(x_iw_1 + y_iw_2 + b) \\
+  \pd[w_1]a_i
+      &= \pd[w_1](\sigma(x_iw_1 + y_iw_2 + b)) = \\
+      &= a_i(1 - a_i)\pd[w_1](x_iw_1 + y_iw_2 + b) = \\
+      &= a_i(1 - a_i)x_i \\
+  \pd[w_2]a_i &= a_i(1 - a_i)y_i \\
+  \pd[b]a_i &= a_i(1 - a_i) \\
+  C &= \avgsum[i, n](a_i - z_i)^2 \\
+  \pd[w_1] C
+      &= \avgsum[i, n]\pd[w_1]\left((a_i - z_i)^2\right) = \\
+      &= \avgsum[i, n]2(a_i - z_i)\pd[w_1]a_i = \\
+      &= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i)x_i \\
+  \pd[w_2] C &= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i)y_i \\
+  \pd[b] C &= \avgsum[i, n]2(a_i - z_i)a_i(1 - a_i) \\
+\end{align}
+
+\subsection{Two Neurons Model with 1 input}
+
+\begin{center}
+  \begin{tikzpicture}
+    \node (X) at (-\d, 0) {$x$};
+    \node[shape=circle,draw=black] (N1) at (0, 0) {$\sigma, b^{(1)}$};
+    \node[shape=circle,draw=black] (N2) at (\d, 0) {$\sigma, b^{(2)}$};
+    \node (Y) at ({2*\d}, 0) {$y$};
+    \path[->] (X) edge node[above] {$w^{(1)}$} (N1);
+    \path[->] (N1) edge node[above] {$w^{(2)}$} (N2);
+    \path[->] (N2) edge (Y);
+  \end{tikzpicture}
+\end{center}
+
+\begin{align}
+  a^{(1)} &= \sigma(xw^{(1)} + b^{(1)}) \\
+  y &= \sigma(a^{(1)}w^{(2)} + b^{(2)})
+\end{align}
+
+\subsubsection{Cost}
+
+\begin{align}
+  a_i^{(1)} &= \sigma(x_iw^{(1)} + b^{(1)}) \\
+  \pd[w^{(1)}]a_i^{(1)} &= a_i^{(1)}(1 - a_i^{(1)})x_i \\
+  \pd[b^{(1)}]a_i^{(1)} &= a_i^{(1)}(1 - a_i^{(1)}) \\
+  a_i^{(2)} &= \sigma(a_i^{(1)}w^{(2)} + b^{(2)}) \\
+  \pd[w^{(2)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)})a_i^{(1)} \\
+  \pd[b^{(2)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)}) \\
+  \pd[a_i^{(1)}]a_i^{(2)} &= a_i^{(2)}(1 - a_i^{(2)})w^{(2)} \\
+  C^{(2)} &= \avgsum[i, n] (a_i^{(2)} - y_i)^2 \\
+  \pd[w^{(2)}] C^{(2)}
+            &= \avgsum[i, n] \pd[w^{(2)}]((a_i^{(2)} - y_i)^2) = \\
+            &= \avgsum[i, n] 2(a_i^{(2)} - y_i)\pd[w^{(2)}]a_i^{(2)} = \\
+            &= \avgsum[i, n] 2(a_i^{(2)} - y_i)a_i^{(2)}(1 - a_i^{(2)})a_i^{(1)} \\
+  \pd[b^{(2)}] C^{(2)} &= \avgsum[i, n] 2(a_i^{(2)} - y_i)a_i^{(2)}(1 - a_i^{(2)}) \\
+  \pd[a_i^{(1)}]C^{(2)} &= \avgsum[i, n] 2(a_i^{(2)} - y_i)a_i^{(2)}(1 - a_i^{(2)})w^{(2)} \\
+  e_i &= a_i^{(1)} - \pd[a_i^{(1)}]C^{(2)} \\
+  C^{(1)} &= \avgsum[i, n] (a_i^{(1)} - e_i)^2 \\
+  \pd[w^{(1)}]C^{(1)}
+            &= \pd[w^{(1)}]\left(\avgsum[i, n] (a_i^{(1)} - e_i)^2\right) =\\
+            &= \avgsum[i, n] \pd[w^{(1)}]\left((a_i^{(1)} - e_i)^2\right) =\\
+            &= \avgsum[i, n] 2(a_i^{(1)} - e_i)\pd[w^{(1)}]a_i^{(1)} =\\
+            &= \avgsum[i, n] 2(\pd[a_i^{(1)}]C^{(2)})a_i^{(1)}(1 - a_i^{(1)})x_i \\
+  \pd[b^{(1)}]C^{(1)} &= \avgsum[i, n] 2(\pd[a_i^{(1)}]C^{(2)})a_i^{(1)}(1 - a_i^{(1)}) \\
+\end{align}
+
+\subsection{Arbitrary Neurons Model with 1 input}
+
+Let's assume that we have $m$ layers.
+
+\subsubsection{Feed-Forward}
+
+Let's assume that $a_i^{(0)}$ is $x_i$.
+
+\begin{align}
+  a_i^{(l)} &= \sigma(a_i^{(l-1)}w^{(l)} + b^{(l)}) \\
+  \pd[w^{(l)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)})a_i^{(l-1)} \\
+  \pd[b^{(l)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)}) \\
+  \pd[a_i^{(l-1)}]a_i^{(l)} &= a_i^{(l)}(1 - a_i^{(l)})w^{(l)} \\
+\end{align}
+
+\subsubsection{Back-Propagation}
+
+Let's define the base case $\pd[a_i^{(m)}]C^{(m+1)}$ to be $a_i^{(m)} - y_i$.
+
+\begin{align}
+  C^{(l)} &= \avgsum[i, n] (\pd[a_i^{(l)}]C^{(l+1)})^2 \\
+  \pd[w^{(l)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)})a_i^{(l-1)} \\
+  \pd[b^{(l)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)}) \\
+  \pd[a_i^{(l-1)}]C^{(l)} &= \avgsum[i, n] 2(\pd[a_i^{(l)}]C^{(l+1)})a_i^{(l)}(1 - a_i^{(l)})w^{(l)} \\
+\end{align}
+
+\end{document}

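The Back-Propagation subsection above translates almost line-for-line into C. Below is a minimal sketch (not part of the commit) of the m-layer, single-input chain: feed-forward per the $a_i^{(l)}$ formula, then one backward sweep that carries the partial with respect to $a^{(l)}$ down the layers. The layer count, training data, learning rate, and iteration count are illustrative assumptions, and the factor of 2 is applied once at the output rather than reintroduced per intermediate cost.

// Chain of M sigmoid neurons with one input, trained with the
// back-propagation recurrence from papers/grad.tex.
// Assumptions (not from the commit): M, the data, rate, iterations.
#include <stdio.h>
#include <math.h>

#define M 3   // number of layers, the paper's m
#define N 4   // number of samples, the paper's n

float sigmoidf(float x)
{
    return 1.0f/(1.0f + expf(-x));
}

int main(void)
{
    // Hypothetical data squashed into (0, 1) so the sigmoid can fit it
    float xs[N] = {0.0f, 0.1f, 0.2f, 0.3f};
    float ys[N] = {0.0f, 0.2f, 0.4f, 0.6f};

    float w[M + 1], b[M + 1];  // 1-indexed to match w^{(l)}, b^{(l)}
    for (int l = 1; l <= M; ++l) { w[l] = 0.5f; b[l] = 0.0f; }

    float rate = 1e-1f;
    for (int iter = 0; iter < 100*1000; ++iter) {
        float dw[M + 1] = {0}, db[M + 1] = {0};
        for (int i = 0; i < N; ++i) {
            // Feed-forward: a^{(0)} = x_i, a^{(l)} = sigma(a^{(l-1)}w^{(l)} + b^{(l)})
            float a[M + 1];
            a[0] = xs[i];
            for (int l = 1; l <= M; ++l) {
                a[l] = sigmoidf(a[l-1]*w[l] + b[l]);
            }
            // Backward sweep, seeded by the output error a^{(m)} - y_i
            float da = 2*(a[M] - ys[i]);
            for (int l = M; l >= 1; --l) {
                float s = da*a[l]*(1 - a[l]);  // da times sigma'
                dw[l] += s*a[l-1];             // pd[w^{(l)}] contribution
                db[l] += s;                    // pd[b^{(l)}] contribution
                da = s*w[l];                   // pd[a^{(l-1)}]a^{(l)} carries it down
            }
        }
        for (int l = 1; l <= M; ++l) {
            w[l] -= rate*dw[l]/N;  // average over the n samples, then step
            b[l] -= rate*db[l]/N;
        }
    }

    for (int i = 0; i < N; ++i) {
        float a = xs[i];
        for (int l = 1; l <= M; ++l) a = sigmoidf(a*w[l] + b[l]);
        printf("x = %f: model = %f, expected = %f\n", xs[i], a, ys[i]);
    }
    return 0;
}

The backward sweep reuses $\pd[a_i^{(l-1)}]a_i^{(l)} = a_i^{(l)}(1 - a_i^{(l)})w^{(l)}$ from the Feed-Forward subsection, so each layer's gradient costs one multiply beyond the feed-forward pass.
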
+ 32 - 16
twice.c

@@ -20,40 +20,56 @@ float rand_float(void)
 // w1, w2, w3, ...
 // y = x1*w1 + x2*w2 + x3*w3 + ... + b
 
-float cost(float w, float b)
+float cost(float w)
 {
     float result = 0.0f;
-    for (size_t i = 0; i < train_count; ++i) {
+    size_t n = train_count;
+    for (size_t i = 0; i < n; ++i) {
         float x = train[i][0];
-        float y = x*w + b;
+        float y = x*w;
         float d = y - train[i][1];
         result += d*d;
     }
-    result /= train_count;
+    result /= n;
+    return result;
+}
+
+float dcost(float w)
+{
+    float result = 0.0f;
+    size_t n = train_count;
+    for (size_t i = 0; i < n; ++i) {
+        float x = train[i][0];
+        float y = train[i][1];
+        result += 2*(x*w - y)*x;
+    }
+    result /= n;
     return result;
 }
 
 int main()
 {
-    srand(time(0));
+    // srand(time(0));
+    srand(69);
     float w = rand_float()*10.0f;
-    float b = rand_float()*5.0f;
 
-    float eps = 1e-3;
-    float rate = 1e-3;
+    float rate = 1e-1;
 
-    printf("%f\n", cost(w, b));
-    for (size_t i = 0; i < 500; ++i) {
-        float c = cost(w, b);
-        float dw = (cost(w + eps, b) - c)/eps;
-        float db = (cost(w, b + eps) - c)/eps;
+    printf("cost = %f, w = %f\n", cost(w), w);
+    for (size_t i = 0; i < 50; ++i) {
+#if 0
+        float eps = 1e-3;
+        float c = cost(w);
+        float dw = (cost(w + eps) - c)/eps;
+#else
+        float dw = dcost(w);
+#endif
         w -= rate*dw;
-        b -= rate*db;
-        printf("cost = %f, w = %f, b = %f\n", cost(w, b), w, b);
+        printf("cost = %f, w = %f\n", cost(w), w);
     }
 
     printf("------------------------------\n");
-    printf("w = %f, b = %f\n", w, b);
+    printf("w = %f\n", w);
 
     return 0;
 }
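
As a quick analytic check on the new dcost: assuming twice.c's training data (defined above the hunks shown here) encodes $y_i = 2x_i$, the derivative from the ``Twice'' section of grad.tex vanishes exactly at $w = 2$:

\begin{align}
  C'(2) = \frac{1}{n}\sum_{i=1}^{n}2(2x_i - y_i)x_i
        = \frac{1}{n}\sum_{i=1}^{n}2(2x_i - 2x_i)x_i = 0
\end{align}

so $w = 2$ is a stationary point of the update $w \leftarrow w - \mathrm{rate} \cdot C'(w)$, which is what the 50-iteration loop settles toward.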