#19oct18 after Andrew Trask's XOR 1 hidden layer code (modified by GF) 
#17oct18 rationalize variable names notation  
#14oct18 continue for MA490 lecture 19oct18 
#12mar18 Once more with feeling from Andrew Trask 
import numpy as np ## C-library for array operations
np.set_printoptions(formatter={'float': lambda x: "{0:0.3f}".format(x)})
## round printouts to 3 decimal places


def sigmoid(z):
    """Return the logistic function 1/(1+exp(-z)), applied termwise to z."""
    return 1/(1+np.exp(-z))


## Training data: 4 input rows; the third column is the constant 1, so the
## third row of W1 acts as a bias for the hidden layer.
X0 = np.array([[0,0,1],[0,1,1],[1,0,1],[1,1,1]]) ## X0(4,3)=4rows of 3vcrs
## Target truth table: exactly one y0 line should be active.
y0 = np.array([[0,1,1,0]]).T ## XOR, y0(4,1) column vector
#y0 = np.array([[0,1,1,1]]).T ## OR, y0(4,1) column vector
#y0 = np.array([[0,0,0,1]]).T ## AND, y0(4,1) column vector
#y0 = np.array([[1,0,0,1]]).T ## IFF, y0(4,1) column vector
#y0 = np.array([[1,0,1,1]]).T ## => y0(4,1) column vector
np.random.seed(1) ##for repeatable experiments
## Initial weights drawn uniformly from [-1,1)
W1= 2*np.random.random((3,4))-1 ##W1(3,4) signed fractions
w2= 2*np.random.random((4,1))-1 ##w2(4,1) signed fractions
## Alternative initializations for experiments (uncomment one pair):
#eps = 0.000000000001 #.0001ok
#W1= 1 - eps*np.random.random((3,4)) ##W1(3,4) nhd of 1 
#w2= 1 - eps*np.random.random((4,1)) ##w2(4,1) nhd of 1 
#W1 = np.zeros((3,4)) ## all 0
#w2 = np.zeros((4,1))
#W1 = np.ones((3,4))  ## all 1
#w2 = np.ones((4,1))
#W1[2][3]= 1. ## contaminators
#w2[1]=-1.
## print(...) with a single argument behaves the same on Python 2 and 3
print("initial W1")
print(W1)
print("initial w2")
print(w2)
for jj in range(600): ## by 600 obvious trend, Trask uses 60,000 
    ## forward pass
    Y1 = sigmoid(np.dot(X0,W1)) #sigma( X0(4,3)W1(3,4) )=Y1(4,4)
    y2 = sigmoid(np.dot(Y1,w2)) #sigma( Y1(4,4)w2(4,1) )=y2(4,1)
    #input() ## kludge: wait for keypress (raw_input() on Python 2)
    #print(jj)  ## watch convergence
    #print(y2)  ## output
    #print(W1)  ## first weight matrix
    #print(w2)  ## second weight vector
    ## backward pass: compute BOTH deltas from the weights that produced
    ## this forward pass, and only then change the weights. Updating w2
    ## first would back-propagate through weights the forward pass never
    ## used, which is not the gradient of the error.
    dy2= (y0-y2)*y2*(1-y2) ## dy2(4,1) arithmetic is termwise
    dY1= dy2.dot(w2.T)*Y1*(1-Y1) ## dy2(4,1)w2.T(1,4)=(4,4), termwise with Y1
    ## weight updates (step size is implicitly 1)
    w2 += Y1.T.dot(dy2) ##backprop Y1(4,4).T dy2(4,1)= dw2(4,1)
    W1 += X0.T.dot(dY1) ##backprop X0.T(3,4) dY1(4,4)= dW1(3,4)
#endfor
## Y1 and y2 below are from the last forward pass, i.e. computed with the
## weights as they were just before the final update.
print("final W1=")
print(W1)
print("final w2=")
print(w2)
print("outcome Y1=")
print(Y1)
print("outcome y2=")
print(y2)