Idea: let's start with a network of random weights and biases, then repeatedly move them in the direction that reduces the error.
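In other words, take small steps down the error gradient: w <- w - eta * dE/dw, where eta is a step size (the code below effectively uses eta = 1, since the updates add the raw gradient terms directly).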
# NumPy (numerical Python library) makes vector math efficient and easy
import numpy as np
s = lambda z: 1/(1+np.exp(-z))  # sigmoid activation; its derivative is s(z)*(1-s(z))
# training data: 4 examples, 3 inputs each (third column is a constant bias input)
X = np.array([[0,0,1],[0,1,1],[1,0,1],[1,1,1]])
# target outputs: XOR of the first two inputs, as a (4,1) column vector
y = np.array([[0,1,1,0]]).T
# first weight matrix, shape (3,4): 3 inputs -> 4 hidden units, drawn uniformly from [-1, 1)
syn0 = 2*np.random.random((3,4)) - 1
# second weight matrix, shape (4,1): 4 hidden units -> 1 output
syn1 = 2*np.random.random((4,1)) - 1
for j in range(60000):
    # forward pass
    l1 = s(np.dot(X, syn0))   # hidden layer activations, shape (4,4)
    l2 = s(np.dot(l1, syn1))  # output layer, shape (4,1)
    # backward pass: error at each layer, scaled by the slope of the sigmoid
    l2_delta = (y - l2) * (l2 * (1 - l2))
    l1_delta = l2_delta.dot(syn1.T) * (l1 * (1 - l1))
    # gradient-descent step: nudge the weights to reduce the error
    syn1 += l1.T.dot(l2_delta)
    syn0 += X.T.dot(l1_delta)
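After the loop finishes, l2 holds the network's predictions on the training inputs. A quick sanity check (a minimal sketch, assuming it runs right after the code above) is to compare them against y:

print("Output after training:")
print(l2)  # each row should be close to the matching row of y: ~0, ~1, ~1, ~0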
Next step: teach computers to see! Andrej Karpathy is the man... his CS231n course on convolutional neural networks is a great place to continue.