{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# MATH7502 Project\n",
    "## Topic 4. Deep Learning   \n",
    "\n",
    "<br>  \n",
    "\n",
    "Group Member\n",
    "* Haomingxuan Chen 45585209\n",
    "* Yaowen Chang 45768262\n",
    "* Meng Li 45282393\n",
    "\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Header Files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "using LinearAlgebra\n",
    "using Random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## SGD Algorithm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "$ y = 3x_1 + 4x_2 $\n",
    "\n",
    "#### Iteration formula\n",
    "set \n",
    "\n",
    "$ \\alpha $ be the $ learning\\ rate $\n",
    "\n",
    "$ \\theta $ be the parameter waiting to optimize\n",
    "\n",
    "$ h_{\\theta}(x) = \\theta_0 + \\theta_1x_1+\\theta_2x_2 $\n",
    "\n",
    "we have\n",
    "\n",
    "$ \\theta_j = \\theta_j -\\alpha(h_{\\theta}(x^i) - y^i)x^i $"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the number of iterations: 838\n",
      "theta: [2.97945, 4.01252]\n",
      "trained formula: y = 2.98x_1 + 4.01x_2\n",
      "error: 9.645302679730802e-5\n"
     ]
    }
   ],
   "source": [
    "function sgd()\n",
    "\n",
    "    # training set\n",
    "    x = [1 1; 1 2; 1 3; 2 1; 2 2; 2 3; 2 4; 3 5]\n",
    "    c = [3; 4] # c is the coefficient\n",
    "    y = x * c # generate y\n",
    "\n",
    "    # initialization\n",
    "    m, d = size(x) # m is the number of the data turple, d is the number of x\n",
    "    theta = zeros(d) # parameter\n",
    "    alpha = 0.01 # learning rate\n",
    "    limit = 0.0001 # threshold of error for stopping the iteration\n",
    "    error = 0 # original error is zero\n",
    "    g = 0 # the gradient\n",
    "\n",
    "    n = 10000 # the number of iteration\n",
    "    point = 0 # pointer for the stopping location\n",
    "    for i in 1:n\n",
    "        j = i % m\n",
    "        if j == 0\n",
    "            j = m\n",
    "        end\n",
    "        \n",
    "        error = 1 / (2 * m) * (((x * theta) - y)' * ((x * theta) - y))\n",
    "        # stop the iteration\n",
    "        if abs(error) <= limit\n",
    "            point = i\n",
    "            break\n",
    "        end\n",
    "\n",
    "        # maintain the theta\n",
    "        g = x[j, :] * ((x[j, :]' * theta) - y[j])\n",
    "        theta -= alpha * g\n",
    "    end\n",
    "\n",
    "    println(\"the number of iterations: \", point + 1)\n",
    "    println(\"theta: \",theta)\n",
    "    println(\"trained formula: \", \"y = \", round(theta[1], digits = 2), \"x_1 + \", round(theta[2], digits = 2), \"x_2\")\n",
    "    println(\"error: \",error)\n",
    "end\n",
    "\n",
    "sgd()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Adam Algorithm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "$ y = 3x_1 + 4x_2 $\n",
    "\n",
    "#### Iteration formula\n",
    "\n",
    "$ g=(h_{\\theta}(x^i)-y^i)x^i $\n",
    "\n",
    "$ m_t = \\beta_1 m_{t-1} + (1 - \\beta_1) \\times g $\n",
    "\n",
    "$ v_t = \\beta_2 v_{t-1} + (1 - \\beta_2) \\times g^2 $\n",
    "\n",
    "$ \\hat{m_t} = \\frac{m_t}{1-\\beta_1^t} $\n",
    "\n",
    "$ \\hat{v_t} = \\frac{v_t}{1-\\beta_2^t} $\n",
    "\n",
    "$ \\theta_j = \\theta_j - \\hat{m_t}\\times\\frac{\\alpha}{\\sqrt{\\hat{v_t}}+\\epsilon} $"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "the number of iterations: 5169\n",
      "theta: [3.02072, 3.98696]\n",
      "trained formula: y = 3.02x_1 + 3.99x_2\n",
      "error: 9.984814874907856e-5\n"
     ]
    }
   ],
   "source": [
    "function Adam()\n",
    "    \n",
    "    # training set\n",
    "    x = [1 1; 1 2; 1 3; 2 1; 2 2; 2 3; 2 4; 3 5]\n",
    "    c = [3; 4] # c is the coefficient\n",
    "    y = x * c # generate y\n",
    "    \n",
    "    # initialization\n",
    "    m, d = size(x) # m is the number of the data turple, d is the number of x\n",
    "    theta = zeros(d) # parameter\n",
    "    \n",
    "    alpha = 0.01 # learning rate\n",
    "    momentum = 0.1 # the momentum\n",
    "    limit = 0.0001 # threshold of error for stopping the iteration\n",
    "    error = 0 # original error is zero\n",
    "    g = 0 # the gradient\n",
    "    \n",
    "    b1 = 0.9 # default parameter\n",
    "    b2 = 0.999 # default parameter\n",
    "    eps = 0.00000001 # default parameter\n",
    "    mt = zeros(d)\n",
    "    vt = zeros(d)\n",
    "    \n",
    "    n = 10000 # the number of iteration\n",
    "    point = 0 # pointer for the stopping location\n",
    "    for i in 1:n\n",
    "        j = i % m\n",
    "        if j == 0\n",
    "            j = m\n",
    "        end\n",
    "    \n",
    "        error = 1 / (2 * m) * (((x * theta) - y)' * ((x * theta) - y))\n",
    "        if abs(error) <= limit\n",
    "            point = i\n",
    "            break\n",
    "        end\n",
    "        \n",
    "        g = x[j, :] * ((x[j, :]' * theta) - y[j])\n",
    "        \n",
    "        mt = b1 * mt + (1 - b1) * g\n",
    "        vt = b2 * vt + (1 - b2) * (g .^ 2)\n",
    "\n",
    "        mtt = mt / (1 - (b1^(i + 1)))\n",
    "        vtt = vt / (1 - (b2^(i + 1)))\n",
    "\n",
    "        # maintain the theta\n",
    "        theta -= alpha * mtt ./ ([sqrt(vtt[1]); sqrt(vtt[2])] .+ eps)\n",
    "    end\n",
    "    \n",
    "    println(\"the number of iterations: \", point + 1)\n",
    "    println(\"theta: \",theta)\n",
    "    println(\"trained formula: \", \"y = \", round(theta[1], digits = 2), \"x_1 + \", round(theta[2], digits = 2), \"x_2\")\n",
    "    println(\"error: \",error)\n",
    "end\n",
    "Adam()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## BackPropagate Algorithm"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "#### Input $ x $\n",
    "set the corresponding activation $ a^1 $ for the input layer\n",
    "\n",
    "#### Activation Function $ Sigmoid $\n",
    "$ f(x)= \\frac{1}{1+e^{-x}} $\n",
    "\n",
    "#### Feedforward\n",
    "for each $ l = 2,3,\\ldots,L $ compute $ z^l = w^la^{l-1} + b^l $ and $ a^l = \\sigma(z^l) $\n",
    "\n",
    "#### Output error $ \\delta^L $\n",
    "compute the vector $ \\delta^L = \\nabla_aC \\times \\sigma'(z^l) $\n",
    "\n",
    "#### BackPropagate the error\n",
    "for each $ l = L-1, L-2, \\ldots, 2 $ compute \\delta^L = ((w^{l+1}^T\\delta^{l+1}) \\times \\sigma'(z^l)\n",
    "\n",
    "#### Output\n",
    "the gradient of the cost function is given by $ \\frac{\\partial C}{\\partial w^l_{jk}} = a_k^{l-1}\\delta_j^l $ and $ \\frac{\\partial C}{\\partial b_j^l} = \\delta_j^l $"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "error 0.04349112077113367\n",
      "error 0.025942003856079927\n",
      "error 0.019702164630193773\n",
      "error 0.01630461635488493\n",
      "error 0.014106366357671821\n",
      "error 0.012542928023103181\n",
      "error 0.01136224873105968\n",
      "error 0.010432802357584628\n",
      "error 0.009678421305883379\n",
      "error 0.009051576819171896\n",
      "--------------------------\n",
      "the prediction of [0 1 1] is 1\n"
     ]
    }
   ],
   "source": [
    "# single hidden layer neural networks using backpropagate algorithm to train the model\n",
    "\n",
    "# the average of abs(a)\n",
    "function mean_abs(a)\n",
    "    temp_sum = 0\n",
    "    temp_num = 0\n",
    "    for i in a\n",
    "        temp_sum += abs(i)\n",
    "        temp_num += 1\n",
    "    end\n",
    "    return temp_sum / temp_num\n",
    "end\n",
    "\n",
    "# use sigmoid as activation function\n",
    "function sigmoid(x, deriv)\n",
    "    if deriv == true\n",
    "        x = (1 .- x) .* x\n",
    "    else\n",
    "        e = 2.718281828459\n",
    "        x = 1 ./ (1 .+ e .^(-x))\n",
    "        return x\n",
    "    end\n",
    "end\n",
    "\n",
    "# train a model\n",
    "function train(x, y)\n",
    "    \n",
    "    # using random as initial weights\n",
    "    m, d = size(x)\n",
    "    w0 = 2 * rand(d, m) - ones(d, m)\n",
    "    w1 = 2 * rand(m, 1) - ones(m, 1)\n",
    "\n",
    "    n = 10000 # number of iteration\n",
    "    for i in 1:n\n",
    "    \n",
    "        # feedforward\n",
    "        l0 = x\n",
    "        l1 = sigmoid(l0 * w0, false)\n",
    "        l2 = sigmoid(l1 * w1, false)\n",
    "        l2_error = -(y - l2)\n",
    "    \n",
    "        if (i % 1000 == 0)       \n",
    "            println(\"error \", mean_abs(l2_error))\n",
    "        end\n",
    "    \n",
    "        # backpropagate\n",
    "        l2_delta = l2_error .* sigmoid(l2, true) # error of output_layer    \n",
    "        pd_h2o = l1' * l2_delta # partial derivative from hidden_layer to output_layer\n",
    "\n",
    "        l1_error = l2_delta * w1'\n",
    "        l1_delta = l1_error .* sigmoid(l1, true) # error of hidden_layer\n",
    "        pd_i2h = l0' * l1_delta # partial derivative from input_layer to hidden_layer\n",
    " \n",
    "        w1 -= pd_h2o\n",
    "        w0 -= pd_i2h\n",
    "    end \n",
    "    return (w0, w1)\n",
    "end\n",
    "\n",
    "# use trained model to test the result\n",
    "function predict(x, w0, w1)\n",
    "    l1 = sigmoid(x * w0, false)\n",
    "    l2 = sigmoid(l1 * w1, false)\n",
    "    if l2[1] >= 0.5\n",
    "        println(\"the prediction of \", x, \" is 1\")\n",
    "    else\n",
    "        println(\"the prediction of \", x, \" is 0\")\n",
    "    end\n",
    "end\n",
    "\n",
    "\n",
    "# training set\n",
    "x = [0 0 1; 0 1 1; 1 0 1; 1 1 1; 0 0 1]\n",
    "y = [0; 1; 1; 0; 0]\n",
    "\n",
    "# train the model\n",
    "w0, w1 = train(x, y)\n",
    "\n",
    "println(\"--------------------------\")\n",
    "\n",
    "# test the model\n",
    "predict([0 1 1], w0, w1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Julia 1.0.3 (4 threads)",
   "language": "julia",
   "name": "julia-1.0k"
  },
  "language_info": {
   "file_extension": ".jl",
   "mimetype": "application/julia",
   "name": "julia",
   "version": "1.0.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}