function [net, to] = DoBackProp_GPU(inData, t, net, to, para, SHIFT_VECTOR)
%DOBACKPROP_GPU  One mini-batch of forward + backward propagation with a
%momentum-SGD update of all weights, biases and the word code book.
%
% Inputs
%   inData       : batchSize x n_words matrix of row indices into
%                  net.codeBook (one index per context word).
%   t            : target class index for every sample of the batch.
%   net          : the network to be trained; it should already be built
%                  and initialized. Fields used here: w{k}, b{k}, codeBook.
%   to           : optimizer state. Fields used here: dw{k}, db{k} (momentum
%                  terms for weights/biases), wordvec (momentum term for the
%                  code book) and df_w (initial value of the code-book
%                  gradient accumulator - presumably all zeros; confirm
%                  against the caller).
%   para         : hyper-parameters. Fields used here: NNsize, momentum,
%                  lrate, wdr (weight-decay rate), n_words, vecSize.
%   SHIFT_VECTOR : offsets added to class/word indices to turn them into
%                  linear indices of an nClass x batchSize matrix.
%                  NOTE(review): presumably (0:batchSize-1)*nClass with the
%                  same orientation as t and inData(:,i) - confirm.
%
% Outputs
%   net          : network with updated w, b and codeBook.
%   to           : optimizer state with updated dw, db and wordvec.
%
% length(para.NNsize) must be at least 3, because the softmax layer
% (index nLayers-1) reads the activations of layer nLayers-2.

nLayers = length(para.NNsize);
nClass  = size(net.codeBook,1);

%%%%%%% Input: look up and concatenate the word vectors
batchSize = size(inData,1);
n_words   = size(inData,2);
% Collect the per-word slices first and concatenate once, instead of
% growing the matrix (and reallocating it) on every iteration.
wordVecs = cell(1, n_words);
for i = 1 : n_words
    wordVecs{i} = net.codeBook(inData(:,i),:);
end
data = [wordVecs{:}];

%******* Forward
%% Input layer (ReLU)
iLayer = 1;
x{iLayer} = bsxfun(@plus, data * net.w{iLayer}, net.b{iLayer});
y{iLayer} = x{iLayer}.*(x{iLayer}>0);   % ReLU activation

%% Hidden layers (ReLU)
for iLayer = 2 : nLayers-2
  x{iLayer} = bsxfun(@plus, y{iLayer-1} * net.w{iLayer}, net.b{iLayer});
  y{iLayer} = x{iLayer}.*(x{iLayer}>0); % ReLU activation
end

%% Output layer (softmax)
iLayer = nLayers-1;
logits = bsxfun(@plus, y{iLayer-1} * net.w{iLayer}, net.b{iLayer});
% Subtract the row-wise maximum before exponentiating. The softmax value is
% mathematically unchanged, but exp() can no longer overflow to Inf (which
% would turn the normalized probabilities into NaN).
logits = bsxfun(@minus, logits, max(logits, [], 2));
x{iLayer} = exp(logits);
y{iLayer} = bsxfun(@rdivide, x{iLayer}, sum(x{iLayer},2));

%******* Backward
%% Output layer: softmax with cross entropy => delta = y - onehot(t)
iLayer = nLayers-1;
% Work on the transpose so that t + SHIFT_VECTOR addresses the entry
% (class t(k), sample k) by linear index, avoiding a dense one-hot matrix.
edelta = y{iLayer}';
ind = t + SHIFT_VECTOR;
edelta(ind) = (edelta(ind) - 1);
edelta = edelta';
% Propagate the error through the weights BEFORE they are updated.
edelta2 = edelta * net.w{iLayer}';
% Update output layer weights and bias
df_w = y{iLayer-1}' * edelta;
df_b = sum(edelta,1);
to.dw{iLayer} = para.momentum*to.dw{iLayer} - para.lrate*(df_w/batchSize + para.wdr*net.w{iLayer});
net.w{iLayer} = net.w{iLayer} + to.dw{iLayer};
to.db{iLayer} = para.momentum*to.db{iLayer} - para.lrate*(df_b/batchSize);
net.b{iLayer} = net.b{iLayer} + to.db{iLayer};

%% Hidden layers
for iLayer = nLayers-2:-1:2
  % ReLU derivative: pass the error only where the unit was active
  edelta = edelta2 .* (x{iLayer} > 0);
  % signal through (not yet updated) weights
  edelta2 = edelta * net.w{iLayer}';
  % update weights and bias
  df_w = y{iLayer-1}' * edelta;
  df_b = sum(edelta,1);
  to.dw{iLayer} = para.momentum*to.dw{iLayer} - para.lrate*(df_w/batchSize + para.wdr*net.w{iLayer});
  net.w{iLayer} = net.w{iLayer} + to.dw{iLayer};
  to.db{iLayer} = para.momentum*to.db{iLayer} - para.lrate*(df_b/batchSize);
  net.b{iLayer} = net.b{iLayer} + to.db{iLayer};
end

%% Input layer
iLayer = 1;
edelta = edelta2 .* (x{iLayer} > 0);
% Error with respect to the concatenated word vectors; used below for the
% code-book update.
edelta2 = edelta * net.w{iLayer}';
df_w = data' * edelta;
df_b = sum(edelta,1);
to.dw{iLayer} = para.momentum*to.dw{iLayer} - para.lrate*(df_w/batchSize + para.wdr*net.w{iLayer});
net.w{iLayer} = net.w{iLayer} + to.dw{iLayer};
to.db{iLayer} = para.momentum*to.db{iLayer} - para.lrate*(df_b/batchSize);
net.b{iLayer} = net.b{iLayer} + to.db{iLayer};

%% Code book (word vectors)
% Accumulate the gradient of every word position into the code-book rows.
% NOTE(review): the loop bound is para.n_words while the forward pass uses
% size(inData,2); these are presumably equal - confirm against the caller.
df_w = to.df_w;
for i = 1 : para.n_words
    % One-hot matrix (nClass x batchSize): entry (inData(k,i), k) is 1.
    tmp_inData = zeros(nClass, batchSize, 'gpuArray');
    ind = inData(:,i) + SHIFT_VECTOR;
    tmp_inData(ind) = 1;
    % Columns of the input error that belong to the i-th word vector.
    tmp_edelta = edelta2(:,1+(i-1)*para.vecSize: i* para.vecSize );
    % Scatter-add the per-sample errors onto the rows of the used words.
    df_w = df_w + tmp_inData * tmp_edelta;
end
to.wordvec = para.momentum * to.wordvec - para.lrate * (df_w/batchSize + para.wdr * net.codeBook);
net.codeBook = net.codeBook + to.wordvec;