{ "cells": [ { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# MATH7502 Project\n", "## Topic 4. Deep Learning \n", "\n", "
\n", "\n", "Group Member\n", "* Haomingxuan Chen 45585209\n", "* Yaowen Chang 45768262\n", "* Meng Li 45282393\n", "\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Header Files" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [], "source": [ "using LinearAlgebra\n", "using Random" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## SGD Algorithm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$ y = 3x_1 + 4x_2 $\n", "\n", "#### Iteration formula\n", "set \n", "\n", "$ \\alpha $ be the $ learning\\ rate $\n", "\n", "$ \\theta $ be the parameter waiting to optimize\n", "\n", "$ h_{\\theta}(x) = \\theta_0 + \\theta_1x_1+\\theta_2x_2 $\n", "\n", "we have\n", "\n", "$ \\theta_j = \\theta_j -\\alpha(h_{\\theta}(x^i) - y^i)x^i $" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "the number of iterations: 838\n", "theta: [2.97945, 4.01252]\n", "trained formula: y = 2.98x_1 + 4.01x_2\n", "error: 9.645302679730802e-5\n" ] } ], "source": [ "function sgd()\n", "\n", " # training set\n", " x = [1 1; 1 2; 1 3; 2 1; 2 2; 2 3; 2 4; 3 5]\n", " c = [3; 4] # c is the coefficient\n", " y = x * c # generate y\n", "\n", " # initialization\n", " m, d = size(x) # m is the number of the data turple, d is the number of x\n", " theta = zeros(d) # parameter\n", " alpha = 0.01 # learning rate\n", " limit = 0.0001 # threshold of error for stopping the iteration\n", " error = 0 # original error is zero\n", " g = 0 # the gradient\n", "\n", " n = 10000 # the number of iteration\n", " point = 0 # pointer for the stopping location\n", " for i in 1:n\n", " j = i % m\n", " if j == 0\n", " j = m\n", " end\n", " \n", " error = 1 / (2 * m) * (((x * theta) - y)' * ((x * theta) - y))\n", " # stop the iteration\n", " if abs(error) <= limit\n", " point = i\n", " break\n", " end\n", "\n", " # maintain the theta\n", " g = x[j, :] * ((x[j, :]' * theta) - y[j])\n", " theta -= alpha * g\n", " end\n", "\n", " println(\"the number of iterations: \", point + 1)\n", " println(\"theta: \",theta)\n", " println(\"trained formula: \", \"y = \", round(theta[1], digits = 2), \"x_1 + \", round(theta[2], digits = 2), \"x_2\")\n", " println(\"error: \",error)\n", "end\n", "\n", "sgd()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Adam Algorithm" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "$ y = 3x_1 + 4x_2 $\n", "\n", "#### Iteration formula\n", "\n", "$ g=(h_{\\theta}(x^i)-y^i)x^i $\n", "\n", "$ m_t = \\beta_1 m_{t-1} + (1 - \\beta_1) \\times g $\n", "\n", "$ v_t = \\beta_2 v_{t-1} + (1 - \\beta_2) \\times g^2 $\n", "\n", "$ \\hat{m_t} = \\frac{m_t}{1-\\beta_1^t} $\n", "\n", "$ \\hat{v_t} = \\frac{v_t}{1-\\beta_2^t} $\n", "\n", "$ \\theta_j = \\theta_j - \\hat{m_t}\\times\\frac{\\alpha}{\\sqrt{\\hat{v_t}}+\\epsilon} $" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "the number of iterations: 5169\n", "theta: [3.02072, 3.98696]\n", "trained formula: y = 3.02x_1 + 3.99x_2\n", "error: 9.984814874907856e-5\n" ] } ], "source": [ "function Adam()\n", " \n", " # training set\n", " x = [1 1; 1 2; 1 
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Adam Algorithm" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "$ y = 3x_1 + 4x_2 $\n", "\n", "#### Iteration formula\n", "\n", "$ g=(h_{\\theta}(x^{(i)})-y^{(i)})x^{(i)} $\n", "\n", "$ m_t = \\beta_1 m_{t-1} + (1 - \\beta_1) \\times g $\n", "\n", "$ v_t = \\beta_2 v_{t-1} + (1 - \\beta_2) \\times g^2 $ (with $ g^2 $ taken elementwise)\n", "\n", "$ \\hat{m}_t = \\frac{m_t}{1-\\beta_1^t} $\n", "\n", "$ \\hat{v}_t = \\frac{v_t}{1-\\beta_2^t} $\n", "\n", "$ \\theta_j := \\theta_j - \\hat{m}_t\\times\\frac{\\alpha}{\\sqrt{\\hat{v}_t}+\\epsilon} $\n", "\n", "Since $ m_0 = v_0 = 0 $, the raw moment estimates $ m_t $ and $ v_t $ are biased towards zero in early iterations; dividing by $ 1-\\beta_1^t $ and $ 1-\\beta_2^t $ corrects this bias." ] },
{ "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "the number of iterations: 5169\n", "theta: [3.02072, 3.98696]\n", "trained formula: y = 3.02x_1 + 3.99x_2\n", "error: 9.984814874907856e-5\n" ] } ], "source": [ "function Adam()\n", "\n", "    # training set\n", "    x = [1 1; 1 2; 1 3; 2 1; 2 2; 2 3; 2 4; 3 5]\n", "    c = [3; 4]  # c is the true coefficient vector\n", "    y = x * c   # generate y\n", "\n", "    # initialization\n", "    m, d = size(x)    # m is the number of data tuples, d is the number of features\n", "    theta = zeros(d)  # parameter vector\n", "\n", "    alpha = 0.01    # learning rate\n", "    limit = 0.0001  # error threshold for stopping the iteration\n", "    error = 0       # initial error\n", "    g = 0           # the gradient\n", "\n", "    b1 = 0.9          # default decay rate of the first moment estimate\n", "    b2 = 0.999        # default decay rate of the second moment estimate\n", "    eps = 0.00000001  # default smoothing constant\n", "    mt = zeros(d)     # first moment estimate\n", "    vt = zeros(d)     # second moment estimate\n", "\n", "    n = 10000  # maximum number of iterations\n", "    point = 0  # index of the stopping iteration\n", "    for i in 1:n\n", "        # cycle through the training examples\n", "        j = i % m\n", "        if j == 0\n", "            j = m\n", "        end\n", "\n", "        error = 1 / (2 * m) * (((x * theta) - y)' * ((x * theta) - y))\n", "        if abs(error) <= limit\n", "            point = i\n", "            break\n", "        end\n", "\n", "        g = x[j, :] * ((x[j, :]' * theta) - y[j])\n", "\n", "        # update the (biased) moment estimates\n", "        mt = b1 * mt + (1 - b1) * g\n", "        vt = b2 * vt + (1 - b2) * (g .^ 2)\n", "\n", "        # bias-corrected moment estimates\n", "        mtt = mt / (1 - (b1^(i + 1)))\n", "        vtt = vt / (1 - (b2^(i + 1)))\n", "\n", "        # update theta\n", "        theta -= alpha * mtt ./ (sqrt.(vtt) .+ eps)\n", "    end\n", "\n", "    println(\"the number of iterations: \", point + 1)\n", "    println(\"theta: \", theta)\n", "    println(\"trained formula: \", \"y = \", round(theta[1], digits = 2), \"x_1 + \", round(theta[2], digits = 2), \"x_2\")\n", "    println(\"error: \", error)\n", "end\n", "Adam()" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "## Backpropagation Algorithm" ] },
{ "cell_type": "markdown", "metadata": {}, "source": [ "#### Input $ x $\n", "Set the corresponding activation $ a^1 $ for the input layer.\n", "\n", "#### Activation function: sigmoid\n", "$ \\sigma(x)= \\frac{1}{1+e^{-x}} $\n", "\n", "#### Feedforward\n", "For each $ l = 2,3,\\ldots,L $ compute $ z^l = w^la^{l-1} + b^l $ and $ a^l = \\sigma(z^l) $.\n", "\n", "#### Output error $ \\delta^L $\n", "Compute the vector $ \\delta^L = \\nabla_aC \\odot \\sigma'(z^L) $, where $ \\odot $ is the elementwise product.\n", "\n", "#### Backpropagate the error\n", "For each $ l = L-1, L-2, \\ldots, 2 $ compute $ \\delta^l = ((w^{l+1})^T\\delta^{l+1}) \\odot \\sigma'(z^l) $.\n", "\n", "#### Output\n", "The gradient of the cost function is given by $ \\frac{\\partial C}{\\partial w^l_{jk}} = a_k^{l-1}\\delta_j^l $ and $ \\frac{\\partial C}{\\partial b_j^l} = \\delta_j^l $.\n", "\n", "(The implementation below omits the bias terms $ b^l $ for simplicity.)" ] },
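{ "cell_type": "markdown", "metadata": {}, "source": [ "The implementation below exploits the identity $ \\sigma'(z) = \\sigma(z)(1-\\sigma(z)) $: the feedforward pass already stores the activations $ a = \\sigma(z) $, so the derivative can be computed as $ a(1-a) $ without re-evaluating $ \\sigma $. This is why `sigmoid(x, true)` takes an activation, not a pre-activation, as its argument. The next cell is a minimal numerical check of the identity (a sketch at an arbitrary test point, not part of the original code)." ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "# sketch: check sigma'(z) = sigma(z) * (1 - sigma(z)) at one test point\n", "sigma(z) = 1 / (1 + exp(-z))\n", "z = 0.7                                # arbitrary test point\n", "a = sigma(z)                           # activation, as stored by the network\n", "analytic = a * (1 - a)                 # the identity used by sigmoid(x, true)\n", "numeric = (sigma(z + 1e-6) - sigma(z - 1e-6)) / 2e-6  # central difference\n", "println(\"analytic: \", analytic, \"  numeric: \", numeric)" ] },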
{ "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "error 0.04349112077113367\n", "error 0.025942003856079927\n", "error 0.019702164630193773\n", "error 0.01630461635488493\n", "error 0.014106366357671821\n", "error 0.012542928023103181\n", "error 0.01136224873105968\n", "error 0.010432802357584628\n", "error 0.009678421305883379\n", "error 0.009051576819171896\n", "--------------------------\n", "the prediction of [0 1 1] is 1\n" ] } ], "source": [ "# a single-hidden-layer neural network trained with the backpropagation algorithm\n", "\n", "# the mean of the absolute values of a\n", "function mean_abs(a)\n", "    temp_sum = 0\n", "    temp_num = 0\n", "    for i in a\n", "        temp_sum += abs(i)\n", "        temp_num += 1\n", "    end\n", "    return temp_sum / temp_num\n", "end\n", "\n", "# sigmoid activation function; with deriv = true, x is assumed to already be\n", "# a sigmoid output a, so the derivative is a .* (1 .- a)\n", "function sigmoid(x, deriv)\n", "    if deriv\n", "        return (1 .- x) .* x\n", "    else\n", "        return 1 ./ (1 .+ exp.(-x))\n", "    end\n", "end\n", "\n", "# train a model\n", "function train(x, y)\n", "\n", "    # random initial weights in [-1, 1)\n", "    m, d = size(x)\n", "    w0 = 2 * rand(d, m) - ones(d, m)  # input-to-hidden weights\n", "    w1 = 2 * rand(m, 1) - ones(m, 1)  # hidden-to-output weights\n", "\n", "    n = 10000  # number of iterations\n", "    for i in 1:n\n", "\n", "        # feedforward\n", "        l0 = x\n", "        l1 = sigmoid(l0 * w0, false)\n", "        l2 = sigmoid(l1 * w1, false)\n", "        l2_error = -(y - l2)  # derivative of the squared-error cost: l2 - y\n", "\n", "        if (i % 1000 == 0)\n", "            println(\"error \", mean_abs(l2_error))\n", "        end\n", "\n", "        # backpropagate\n", "        l2_delta = l2_error .* sigmoid(l2, true)  # error of the output layer\n", "        pd_h2o = l1' * l2_delta  # partial derivative from hidden layer to output layer\n", "\n", "        l1_error = l2_delta * w1'\n", "        l1_delta = l1_error .* sigmoid(l1, true)  # error of the hidden layer\n", "        pd_i2h = l0' * l1_delta  # partial derivative from input layer to hidden layer\n", "\n", "        # full gradient step (learning rate 1)\n", "        w1 -= pd_h2o\n", "        w0 -= pd_i2h\n", "    end\n", "    return (w0, w1)\n", "end\n", "\n", "# use the trained model to predict the class of x\n", "function predict(x, w0, w1)\n", "    l1 = sigmoid(x * w0, false)\n", "    l2 = sigmoid(l1 * w1, false)\n", "    if l2[1] >= 0.5\n", "        println(\"the prediction of \", x, \" is 1\")\n", "    else\n", "        println(\"the prediction of \", x, \" is 0\")\n", "    end\n", "end\n", "\n", "# training set\n", "x = [0 0 1; 0 1 1; 1 0 1; 1 1 1; 0 0 1]\n", "y = [0; 1; 1; 0; 0]\n", "\n", "# train the model\n", "w0, w1 = train(x, y)\n", "\n", "println(\"--------------------------\")\n", "\n", "# test the model\n", "predict([0 1 1], w0, w1)" ] },
{ "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Julia 1.0.3 (4 threads)", "language": "julia", "name": "julia-1.0k" }, "language_info": { "file_extension": ".jl", "mimetype": "application/julia", "name": "julia", "version": "1.0.3" } }, "nbformat": 4, "nbformat_minor": 2 }