在这一步中,我们读取源数据,研究其中存在的变量,并查看一些示例数据。这将帮助我们了解数据集中存在的不同列并研究其功能。我们将使用 Pandas 库创建数据框,该数据框将在后续步骤中使用。
import pandas as pd

# Dataset source: https://www.kaggle.com/mlg-ulb/creditcardfraud
CSV_PATH = 'E:\\creditcard.csv'

# Read the credit-card transactions file into a DataFrame.
datainput = pd.read_csv(CSV_PATH)

# Show the first five records, followed by a blank line.
preview = datainput.head(5)
print(preview, "\n")

# Report the (rows, columns) dimensions of the complete data set.
print("Shape of Complete Data Set")
print(datainput.shape, "\n")
Time V1 V2 V3 ... V27 V28 Amount Class 0 0.0 -1.359807 -0.072781 2.536347 ... 0.133558 -0.021053 149.62 0 1 0.0 1.191857 0.266151 0.166480 ... -0.008983 0.014724 2.69 0 2 1.0 -1.358354 -1.340163 1.773209 ... -0.055353 -0.059752 378.66 0 3 1.0 -0.966272 -0.185226 1.792993 ... 0.062723 0.061458 123.50 0 4 2.0 -1.158233 0.877737 1.548718 ... 0.219422 0.215153 69.99 0 [5 rows x 31 columns] Shape of Complete Data Set (284807, 31)
import pandas as pd

# Load the creditcard.csv using pandas.
datainput = pd.read_csv('E:\\creditcard.csv')

# NOTE: despite the names, `false` holds the rows with Class == 1 and `true`
# holds the rows with Class == 0; the names mirror the printed labels below.
false = datainput[datainput['Class'] == 1]
true = datainput[datainput['Class'] == 0]

# Count each subset once and reuse the counts, instead of filtering the
# whole DataFrame again for every print statement.
false_count = len(false)
true_count = len(true)

# Ratio of Class == 1 rows to Class == 0 rows (true division in Python 3).
n = false_count / true_count
print(n)

print('False Detection Cases: {}'.format(false_count))
print('True Detection Cases: {}'.format(true_count), "\n")
0.0017304750013189597 False Detection Cases: 492 True Detection Cases: 284315
import pandas as pd

# Load the creditcard.csv using pandas.
datainput = pd.read_csv('E:\\creditcard.csv')

# Check for imbalance in data: split the rows by the value of the Class column.
class_column = datainput['Class']
false = datainput[class_column == 1]
true = datainput[class_column == 0]

# Print a titled summary of the Amount column for each group, in the same
# order as before: Class == 1 rows first, then Class == 0 rows.
for heading, subset in (
    ("False Detection Cases", false),
    ("True Detection Cases", true),
):
    print(heading)
    print("----------------------")
    print(subset['Amount'].describe(), "\n")
False Detection Cases ---------------------- count 492.000000 mean 122.211321 std 256.683288 min 0.000000 25% 1.000000 50% 9.250000 75% 105.890000 max 2125.870000 Name: Amount, dtype: float64 True Detection Cases ---------------------- count 284315.000000 mean 88.291022 std 250.105092 min 0.000000 25% 5.650000 50% 22.000000 75% 77.050000 max 25691.160000 Name: Amount, dtype: float64
import pandas as pd

# Load the creditcard.csv using pandas.
datainput = pd.read_csv('E:\\creditcard.csv')

# Separate features (X) from the label (Y):
# every column except the last is a feature, the last column is the label.
feature_frame = datainput.iloc[:, :-1]
label_series = datainput.iloc[:, -1]

X = feature_frame.values
Y = label_series.values

# Show the dimensions of the feature matrix, then of the label vector.
for array in (X, Y):
    print(array.shape)
(284807, 30) (284807,)
import pandas as pd
from sklearn.model_selection import train_test_split

# Fixed seed so the train/test partition is the same on every run.
RANDOM_STATE = 42

# Load the creditcard.csv using pandas.
datainput = pd.read_csv('E:\\creditcard.csv')

# Separate features (X) from the label (Y):
# every column except the last is a feature, the last column is the label.
X = datainput.iloc[:, :-1].values
Y = datainput.iloc[:, -1].values

# Hold out 20% of the rows for testing.
# stratify=Y keeps the proportion of each Class value the same in both
# partitions; the label is heavily imbalanced, so an unstratified random
# split can leave the test set with a noticeably different share of the
# rare class.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=Y,
)
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Fixed seed so the split and the fitted tree are the same on every run.
RANDOM_STATE = 42

# Load the creditcard.csv using pandas.
datainput = pd.read_csv('E:\\creditcard.csv')

# Separate features (X) from the label (Y):
# every column except the last is a feature, the last column is the label.
X = datainput.iloc[:, :-1].values
Y = datainput.iloc[:, -1].values

# Hold out 20% of the rows for testing. stratify=Y keeps the proportion of
# each Class value the same in both partitions, which matters because the
# label is heavily imbalanced.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=Y,
)

# Fit a shallow decision tree (depth limited to 4) on the training rows.
classifier = DecisionTreeClassifier(max_depth=4, random_state=RANDOM_STATE)
classifier.fit(X_train, Y_train)

# Predict the label of every held-out row.
predicted = classifier.predict(X_test)
print("\npredicted values :\n", predicted)

# Accuracy on the held-out rows, expressed as a percentage.
# NOTE: with a label this imbalanced, accuracy alone says little about how
# well the rare class is detected.
DT = metrics.accuracy_score(Y_test, predicted) * 100
print("\nThe accuracy score using the DecisionTreeClassifier : ", DT)
predicted values : [0 0 0 ... 0 0 0] The accuracy score using the DecisionTreeClassifier : 99.9367999719111
import pandas as pd
from sklearn import metrics
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

# Fixed seed so the split, the fitted tree and every reported metric are
# the same on every run.
RANDOM_STATE = 42

# Load the creditcard.csv using pandas.
datainput = pd.read_csv('E:\\creditcard.csv')

# Separate features (X) from the label (Y):
# every column except the last is a feature, the last column is the label.
X = datainput.iloc[:, :-1].values
Y = datainput.iloc[:, -1].values

# Hold out 20% of the rows for testing. stratify=Y keeps the proportion of
# each Class value the same in both partitions, which matters because the
# label is heavily imbalanced.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=RANDOM_STATE,
    stratify=Y,
)

# Fit a shallow decision tree (depth limited to 4) on the training rows.
classifier = DecisionTreeClassifier(max_depth=4, random_state=RANDOM_STATE)
classifier.fit(X_train, Y_train)

# Predict the label of every held-out row.
predicted = classifier.predict(X_test)
print("\npredicted values :\n", predicted)

# Accuracy on the held-out rows, expressed as a percentage.
DT = metrics.accuracy_score(Y_test, predicted) * 100
print("\nThe accuracy score using the DecisionTreeClassifier : ", DT)

# Each metric below treats Class == 1 as the positive label and is computed
# once, then printed from the stored value.
# (TP = True Positive, TN = True Negative, FP = False Positive,
#  FN = False Negative.)

# Precision = TP / (TP + FP)
print('precision')
precision = precision_score(Y_test, predicted, pos_label=1)
print(precision)

# Recall = TP / (TP + FN)
print('recall')
recall = recall_score(Y_test, predicted, pos_label=1)
print(recall)

# F-score combines precision and recall into a single number.
print('f-Score')
fscore = f1_score(Y_test, predicted, pos_label=1)
print(fscore)
The accuracy score using the DecisionTreeClassifier : 99.9403110845827 precision 0.810126582278481 recall 0.7710843373493976 f-Score 0.7901234567901234