Friday 11 March 2016

Quick Dive to data handeling and machine learning in Python


This model show a quick way to apply different classification machine learning algorithm on data strored in CSV formate.
Python provide very powerful tools like pandas, numpy, and sklearn. we exploit them a lot.

 Source Identification using 2 Class of image

In [72]:
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
import numpy as np
import pandas as pd
# matplotlib.pyplot as plt

Understanding Data

In [2]:
sony=pd.read_csv("Sony_NONIQM.csv")
sony.head()
73













































































In [3]:
nikon=pd.read_csv("Nikons_NONIQM.csv")
nikon.head()
Out[3]:














































































In [4]:
nikon.describe()
Out[4]:




















































































































In [5]:
sony.describe()
Out[5]:





















































































































Create trainig and testing data Frame

Add Label Coloumn

In [6]:
sony.loc[:,'label'] = pd.Series(0, index=sony.index)
In [7]:
sony.head()
Out[7]:



















































































In [8]:
nikon.loc[:,'label'] = pd.Series(1, index=nikon.index)
nikon.head()
Out[8]:




















































































Join Both data set & Normalise

In [9]:
data=pd.concat([sony,nikon])
data[140:148]
Out[9]:






























































































































In [10]:
#select all coulomn except label
data[1:3].loc[:,data.columns != 'label']
Out[10]:
Var1 Var2 Var3 Var4 Var5 Var6 Var7 Var8 Var9 Var10 Var11 Var12
1 125.036287 126.017584 126.640565 0.995081 1.007848 1.012831 -1097190 -1068678 -753861 0.992517 0.984211 0.974200
2 109.158400 106.371291 104.290264 1.019954 0.974467 0.955403 157 641 -3146 0.944636 0.975875 0.884661
In [11]:
Norm_feature=normalize(data.loc[:,data.columns != 'label'], norm='l2', axis=1, copy=True)
In [12]:
Norm_feature
Out[12]:
array([[  3.52823583e-03,   3.62586141e-03,   3.26940692e-03, ...,
          3.01275055e-05,   3.00148129e-05,   2.96453648e-05],
       [  7.32446682e-05,   7.38194998e-05,   7.41844344e-05, ...,
          5.81403910e-07,   5.76538359e-07,   5.70674174e-07],
       [  3.39024810e-02,   3.30368593e-02,   3.23905326e-02, ...,
          2.93385469e-04,   3.03087691e-04,   2.74758635e-04],
       ..., 
       [  4.36099705e-04,   4.24614258e-04,   4.01317458e-04, ...,
          3.55435779e-06,   3.55703070e-06,   3.42947537e-06],
       [  4.87460414e-04,   5.01475208e-04,   4.82155777e-04, ...,
          4.23647775e-06,   4.26887807e-06,   4.04420111e-06],
       [  1.74080612e-04,   1.81601297e-04,   1.78560484e-04, ...,
          1.70667166e-06,   1.69572507e-06,   1.63680092e-06]])

Create train and test

In [13]:
#concatenate numpy array

#training Set
train_x=np.concatenate((Norm_feature[15:144], Norm_feature[160:]), axis=0)
train_y=np.concatenate((np.array(data[15:144].loc[:,"label"]),np.array(data[160:].loc[:,"label"])),axis=0)

#Testing Set
test_x=np.concatenate((Norm_feature[0:15], Norm_feature[144:160]), axis=0)
test_y=np.concatenate((np.array(data[0:15].loc[:,"label"]),np.array(data[144:160].loc[:,"label"])),axis=0)

SVM Classifier

In [14]:
# clsf = classifier
clsf=SVC(kernel='rbf',gamma=10,C=1) #SVM Classier
svm={} #store accuracy data for plotting
In [15]:
#training
clsf.fit(train_x, train_y)
Out[15]:
SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma=10, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

Testing accuracy on training data

In [16]:
#Now testing for trainig data
#Now predict values for given classifier
prediction = clsf.predict(train_x)
svm["trainig_set"]=accuracy_score(prediction,train_y)*100
print 'Accuracy Check ',svm["trainig_set"],'%'
Accuracy Check  82.7450980392 %

Testing accuracy on Testing data

In [17]:
prediction = clsf.predict(test_x)
svm["test_set"]=accuracy_score(prediction,test_y)*100
print 'Accuracy Check ',svm["test_set"],'%'
Accuracy Check  74.1935483871 %
In [18]:
svm
Out[18]:
{'test_set': 74.193548387096769, 'trainig_set': 82.745098039215677}

Decision Tree Classifier

In [19]:
decision={}
clf=DecisionTreeClassifier(max_depth=10,min_samples_split=4)
clf
Out[19]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=10,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=4, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
In [20]:
#train
clf = clf.fit(train_x, train_y)

Testing accuracy on training data

In [21]:
#Now testing for trainig data
#Now predict values for given classifier
prediction = clf.predict(train_x)
decision["trainig_set"]=accuracy_score(prediction,train_y)*100
print 'Accuracy Check ',decision["trainig_set"],'%'
Accuracy Check  98.0392156863 %

Testing accuracy on Testing data

In [22]:
prediction = clf.predict(test_x)
decision["test_set"]=accuracy_score(prediction,test_y)*100
print 'Accuracy Check ',decision["test_set"],'%'
Accuracy Check  80.6451612903 %
In [23]:
decision
Out[23]:
{'test_set': 80.645161290322577, 'trainig_set': 98.039215686274503}

K-mean Clustering

In [24]:
from sklearn.cluster import KMeans
In [25]:
kmean={}
clust=KMeans(n_clusters=2,n_init=1)
clust
Out[25]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
In [26]:
clust.fit(train_x)
Out[26]:
KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=2, n_init=1,
    n_jobs=1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)
In [27]:
def accuraccy(pred,expected):
    #calculate positive and negative prediction
    pos=0
    for i in pred:
        if i==expected:
            pos+=1
    return pos
        

Testing accuracy on training data

In [28]:
#defining class then cheking class
expected=clust.predict(train_x[0].reshape(1,12))[0]
pos=accuraccy(clust.predict(train_x[:128]),expected)

expected2=clust.predict(train_x[-1].reshape(1,12))[0]
if expected2==expected:
    expected2=clust.predict(train_x[-5].reshape(1,12))[0]
    if expected2==expected:
        print "Error in class , fix it mannually"
else:
    pos+=accuraccy(clust.predict(train_x[128:]),expected2)
kmean["trainig_set"]=(pos/float(len(train_y)))*100
print 'Accuracy Check ',kmean["trainig_set"],'%'
Accuracy Check  67.8431372549 %

Testing accuracy on testing data

In [29]:
#defining class then cheking class
expected=clust.predict(test_x[0].reshape(1,12))[0]
pos=accuraccy(clust.predict(test_x[:15]),expected)

expected2=clust.predict(test_x[-1].reshape(1,12))[0]
if expected2==expected:
    expected2=clust.predict(train_x[-5].reshape(1,12))[0]
    if expected2==expected:
        print "Error in class , fix it mannually"
else:
    pos+=accuraccy(clust.predict(train_x[144:160]),expected2)
kmean["test_set"]=(pos/float(len(test_y)))*100
print 'Accuracy Check ',kmean["test_set"],'%'
Accuracy Check  67.7419354839 %
In [30]:
kmean
Out[30]:
{'test_set': 67.74193548387096, 'trainig_set': 67.84313725490196}

Neural Net Regressor

  • Uses Backpropagation for trainig
In [38]:
from sklearn.neural_network import MLPClassifier
In [63]:
neural=dict()
clf = MLPClassifier(algorithm='l-bfgs',hidden_layer_sizes=(50,), alpha=1e-5, random_state=1)
clf
Out[63]:
MLPClassifier(activation='relu', algorithm='l-bfgs', alpha=1e-05,
       batch_size=200, beta_1=0.9, beta_2=0.999, early_stopping=False,
       epsilon=1e-08, hidden_layer_sizes=(50,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)
In [64]:
#trainig Network
clf.fit(train_x, train_y)
Out[64]:
MLPClassifier(activation='relu', algorithm='l-bfgs', alpha=1e-05,
       batch_size=200, beta_1=0.9, beta_2=0.999, early_stopping=False,
       epsilon=1e-08, hidden_layer_sizes=(50,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
       tol=0.0001, validation_fraction=0.1, verbose=False,
       warm_start=False)

Testing accuracy on training data

In [65]:
#Now testing for trainig data
#Now predict values for given classifier
prediction = clf.predict(train_x)
neural["trainig_set"]=accuracy_score(prediction,train_y)*100
print 'Accuracy Check ',neural["trainig_set"],'%'
Accuracy Check  88.6274509804 %

Testing accuracy on testing data

In [66]:
prediction = clf.predict(test_x)
neural["test_set"]=accuracy_score(prediction,test_y)*100
print 'Accuracy Check ',neural["test_set"],'%'
Accuracy Check  80.6451612903 %
In [67]:
neural
Out[67]:
{'test_set': 80.645161290322577, 'trainig_set': 88.627450980392155}

Ploting

In [76]:
import pylab as plt
plt.rcParams['figure.figsize'] = 16, 12
In [77]:
#%pylab inline
classifier = [1,2,3,4]
accuracy_test_data=[svm["test_set"],decision["test_set"],kmean["test_set"],neural["test_set"]]
LABELS=["SVM","Decision-Tree","K-Mean","Neural-net"]

plt.bar(classifier,accuracy_test_data,align='center')
plt.xticks(classifier, LABELS) #binding label with x axis data
plt.xlabel('Classifier')
plt.ylabel('Accuracy')
plt.title('Analysis with 2 Class')
plt.show()
                    

1 comment:

  1. Hello,
    The Article on Quick Dive to data handeling and machine learning in Python is nice.It give detail information about it .Thanks for Sharing the information about Machine Learning. hire data scientists

    ReplyDelete

THANKS FOR UR GREAT COMMENT

Blogger Widgets