
function [D] = DataProcessing(vecSize, n_words, word2vector_init_method)
%DATAPROCESSING Build the word codebook and the PTB train/valid/test sets.
%
%   D = DataProcessing(vecSize, n_words, word2vector_init_method)
%
%   Inputs
%     vecSize                 - number of columns of the codebook; also
%                               selects the vector file
%                               '../data/ptb.vector<vecSize>.csv'.
%     n_words                 - forwarded to csvReader and used in the
%                               random-initialisation range (presumably the
%                               n-gram context length -- confirm against
%                               csvReader).
%     word2vector_init_method - 'skip_gram' : fill the codebook from the
%                                             vector CSV file;
%                               'random'    : uniform values in
%                                             [-r, r], single precision.
%
%   Output
%     D - struct with fields codeBookSize, codeBook, and the
%         *_total_id / *_keep_id pairs returned by csvReader for the
%         train, valid and test CSV files.
%
%   Side effects
%     Writes D to '../data/nnlm.ptb<vecSize>.n_words<n_words>.mat' (v7.3)
%     and prints progress messages.
%
%   Errors
%     DataProcessing:badMapLine        - a map line has no ',<digits>' field.
%     DataProcessing:unknownInitMethod - unsupported initialisation method.

%%% Read the word map: one entry per line, id taken from the first ",<digits>" field.
mapLines = textread('../data/ptb.map.csv','%s','delimiter','\n');
nEntries = length(mapLines);
id = zeros(nEntries,1);
for i = 1:nEntries
    % Single pass: capture the digits of the first ",<digits>" occurrence.
    tok = regexp(mapLines{i}, ',(\d+)', 'tokens', 'once');
    if isempty(tok)
        error('DataProcessing:badMapLine', ...
              'Line %d of ../data/ptb.map.csv has no ",<digits>" field: %s', ...
              i, mapLines{i});
    end
    % +1 shifts the stored id (presumably 0-based) to a valid MATLAB row index.
    % str2double is used instead of str2num because str2num evaluates its input.
    id(i) = str2double(tok{1}) + 1;
end

%%% Build the codebook
D.codeBookSize = size(id,1);
D.codeBook = zeros(D.codeBookSize,vecSize);
switch word2vector_init_method
    case 'skip_gram'
        vec = csvread(sprintf('../data/ptb.vector%d.csv',vecSize));
        % Row i of the vector file belongs to codebook row id(i).
        D.codeBook(id,:) = vec(1:D.codeBookSize,:);
    case 'random'
        % Uniform initialisation in [-initRange, initRange]
        % (looks like a Glorot-style bound scaled by 0.5).
        % Note: this branch yields a single-precision codebook.
        initRange = 0.5*sqrt(6/(n_words*vecSize + D.codeBookSize));
        D.codeBook = (rand(D.codeBookSize, vecSize,'single')*2-1) * initRange;
    otherwise
        % Without this branch an unsupported method would silently save an
        % all-zero codebook.
        if ischar(word2vector_init_method)
            methodName = word2vector_init_method;
        else
            methodName = sprintf('<%s>', class(word2vector_init_method));
        end
        error('DataProcessing:unknownInitMethod', ...
              'Unknown word2vector_init_method "%s" (expected ''skip_gram'' or ''random'').', ...
              methodName);
end
fprintf('create codebook done\n');

%%% Train set
[D.train_total_id, D.train_keep_id] = csvReader('../data/ptb.train.csv',n_words);
fprintf('create train set mat done\n');

%%% Validation set
[D.valid_total_id, D.valid_keep_id] = csvReader('../data/ptb.valid.csv',n_words);
fprintf('create dev set mat done\n');

%%% Test set
[D.test_total_id, D.test_keep_id] = csvReader('../data/ptb.test.csv',n_words);
fprintf('create test set mat done\n');

%%% Save everything to a v7.3 MAT-file
save(sprintf('../data/nnlm.ptb%d.n_words%d.mat',vecSize,n_words),'D','-v7.3');
fprintf('save mat done\n');