| Student | Student ID |
|---|---|
| Christopher Sheaffe | z****** |
| P**** G**** | z****** |
| Y**** H**** | z****** |
In this assignment, you will develop some sub-routines in Python to create useful operations on Bayesian Networks. You will implement an efficient independence test, learn parameters from data, sample from the joint distribution and classify examples. We will use a Bayesian Network for diagnosis of breast cancer.
This notebook requires the following libraries in a Python 3.7 environment to run correctly. Use conda install <library> to install any missing library from the list below.
#libraries
import numpy as np
import pandas as pd
from itertools import product, combinations
from collections import OrderedDict as odict
from tabulate import tabulate
from copy import deepcopy
import random
from pprint import pprint
import csv
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sn
#graph creation
breastCancerGraph = {
"Age": ["BC"],
"Location": ["BC"],
"BreastDensity": ["Mass"],
"Size": [],
"Mass": ["Shape","Margin","Size"],
"BC": ["Metastasis", "Mass", "MC", "SkinRetract", "NippleDischarge", "AD"],
"Metastasis": ["LymphNodes"],
"LymphNodes": [],
"MC": [],
"SkinRetract": [],
"NippleDischarge": [],
"AD": ["FibrTissueDev"],
"FibrTissueDev": ["SkinRetract","NippleDischarge", "Spiculation"],
"Spiculation": ["Margin"],
"Margin": [],
"Shape": [],
}
# This is the main DFS recursive function
def dfs_r(G, v, colour):
"""
argument
`G`, an adjacency list representation of a graph
`v`, next vertex to be visited
`colour`, dictionary with the colour of each node
"""
#print('Visiting: ', v)
# Visited vertices are coloured 'grey'
colour[v] = 'grey'
# Let's visit all outgoing edges from v
for w in G[v]:
        # To avoid loops, we check that the next vertex hasn't been visited yet
if colour[w] == 'white':
dfs_r(G, w, colour)
# When we finish the for loop, we know we have visited all nodes from v. It is time to turn it 'black'
colour[v] = 'black'
# This is an auxiliary DFS function to create and initialize the colour dictionary
def dfs(G, start):
"""
argument
`G`, an adjacency list representation of a graph
`start`, starting vertex
"""
# Create a dictionary with keys as node numbers and values equal to 'white'
colour = dict([(node, 'white') for node in G.keys()])
# Call recursive DFS
dfs_r(G, start, colour)
# We can return colour dictionary. It is useful for some operations, such as detecting connected components
return colour
def topologicalSort_r(G, v, colour, stack):
"""
argument
`G`, an adjacency list representation of a graph
`v`, current vertex
`colour`, colouring dictionary
`stack`, list with topological ordering of nodes
"""
colour[v] = 'grey'
for w in G[v]:
if colour[w] == 'white':
topologicalSort_r(G, w, colour, stack)
colour[v] = 'black'
stack.append(v)
def topologicalSort(G, start):
"""
argument
`G`, an adjacency list representation of a graph
`start`, starting vertex
"""
colour = dict([(node, 'white') for node in G.keys()])
# We use a stack to store the topological ordering of the nodes, so we can reverse it later
stack = []
topologicalSort_r(G, start, colour, stack)
return reversed(stack)
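As written, topologicalSort only orders the nodes reachable from start. A small variant (a sketch reusing topologicalSort_r; the helper name topologicalSortAll is ours) restarts the search from every unvisited node, so the ordering covers the whole graph without having to know a root vertex in advance.
# Sketch: full topological sort covering every node, not only those reachable from a chosen start vertex
def topologicalSortAll(G):
    colour = dict([(node, 'white') for node in G.keys()])
    stack = []
    for node in G.keys():
        if colour[node] == 'white':
            topologicalSort_r(G, node, colour, stack)
    return list(reversed(stack))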
def IsDescendant(graph, parent, descendant):
colours = dfs(graph,parent)
if colours.get(descendant,"") == "black":
return True
else:
return False
def IsAncestor(graph, child, ancestor):
colours = dfs(graph,ancestor)
if colours.get(child,"") == "black":
return True
else:
return False
def PathExist(graph, v1, v2, directional = True):
if IsDescendant(graph,v1,v2):
return True
if not directional:
if IsDescendant(graph,v2,v1):
return True
return False
def PathExistSet(graph, set1, set2, directional = True):
for s1 in set1:
for s2 in set2:
            if PathExist(graph, s1, s2, directional):
return True
return False
def RemoveNode(graph, node):
graph.pop(node)
for key in graph.keys():
if node in graph[key]:
graph[key].remove(node)
def RemoveLeafNodes(graph, filterSet):
toRemove = []
for key in graph.keys():
if key not in filterSet:
if graph[key] == []:
toRemove.append(key)
for key in toRemove:
RemoveNode(graph,key)
return len(toRemove)
def DirectedGraph2UndirectedGraph(graph):
undirectedGraph = deepcopy(graph)
for node in undirectedGraph.keys():
for child in undirectedGraph[node]:
if node not in undirectedGraph[child]:
undirectedGraph[child].append(node)
return undirectedGraph
def transposeGraph(G): #reverse direction of edges
GT = dict((v, []) for v in G)
for v in G:
for w in G[v]:
if w in GT:
GT[w].append(v)
else:
GT[w] = [v]
return GT
def printFactor(f):
"""
argument
`f`, a factor to print on screen
"""
    # Create an empty list that we will fill in with the probability table entries
table = list()
# Iterate over all keys and probability values in the table
for key, item in f['table'].items():
# Convert the tuple to a list to be able to manipulate it
k = list(key)
# Append the probability value to the list with key values
k.append(item)
# Append an entire row to the table
table.append(k)
# dom is used as table header. We need it converted to list
dom = list(f['dom'])
    # Append a 'Pr' to indicate the probability column
dom.append('Pr')
print(tabulate(table,headers=dom,tablefmt='orgtbl'))
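Throughout this notebook a factor is a plain dictionary with a 'dom' tuple of variable names and a 'table' (an OrderedDict keyed by outcome tuples). A minimal hand-made example with hypothetical variables and probabilities shows the format and how printFactor renders it.
# A minimal hand-made factor (hypothetical variables and values) in the format used throughout this notebook
exampleFactor = {
    'dom': ('A', 'B'),
    'table': odict([
        (('true', 'true'), 0.9),
        (('true', 'false'), 0.1),
        (('false', 'true'), 0.2),
        (('false', 'false'), 0.8),
    ])
}
printFactor(exampleFactor)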
Implement the efficient version of the d-separation algorithm in a function d_separation(G,X,Y,Z) that returns a Boolean: True if X is d-separated from Y given Z, and False otherwise. Comment about the time complexity of this procedure.
def d_separation(G,X,Y,Z):
graph = deepcopy(G)
#remove leaves
iterations = 0
while RemoveLeafNodes(graph,X+Y+Z) != 0:
iterations += 1
#remove all outgoing edges for Z
for z in Z:
graph[z] = []
#convert to undirected graph
graph = DirectedGraph2UndirectedGraph(graph)
#if X and Y connected return False
if PathExistSet(graph, X,Y):
return False
else:
return True
# Testing
testGraph = {
"A": ["C"],
"B": ["C","D"],
"C": ["E","F"],
"D": ["E"],
"E": [],
"F": [],
}
#(a) A ⫫ B =T
X = ["A"]
Y = ["B"]
Z = []
print(X, "⫫", Y, "|", Z, " = ", d_separation(testGraph,X,Y,Z))
#(b) A ⫫ D|C =F
X = ["A"]
Y = ["D"]
Z = ["C"]
print(X, "⫫", Y, "|", Z, " = ", d_separation(testGraph,X,Y,Z))
#(c) F ⫫ D|B =T
X = ["F"]
Y = ["D"]
Z = ["B"]
print(X, "⫫", Y, "|", Z, " = ", d_separation(testGraph,X,Y,Z))
#(d) A ⫫ D|F =F
X = ["A"]
Y = ["D"]
Z = ["F"]
print(X, "⫫", Y, "|", Z, " = ", d_separation(testGraph,X,Y,Z))
#(e) D ⫫ F|C =T
X = ["D"]
Y = ["F"]
Z = ["C"]
print(X, "⫫", Y, "|", Z, " = ", d_separation(testGraph,X,Y,Z))
Implement a function learn_bayes_net(G, file, outcomeSpace, prob_tables) that learns the parameters of the Bayesian Network G. This function should output a dictionary prob_tables with all the conditional probability tables (one for each node), as well as the outcomeSpace with the variables' domain values.
We are working with a small Bayesian Network with 16 nodes. What will be the size of the joint distribution with all 16 variables?
def prob(factor, *entry):
"""
argument
`factor`, a dictionary of domain and probability values,
`entry`, a list of values, one for each variable in the same order as specified in the factor domain.
Returns p(entry)
"""
return factor['table'][entry] # insert your code here, 1 line
def allEqualThisIndex(dict_of_arrays, **fixed_vars):
"""
Helper function to create a boolean index vector into a tabular data structure,
such that we return True only for rows of the table where, e.g.
column_a=fixed_vars['column_a'] and column_b=fixed_vars['column_b'].
This is a simple task, but it's not *quite* obvious
for various obscure technical reasons.
It is perhaps best explained by an example.
    >>> allEqualThisIndex(
    ...     {'X': [1, 1, 0], 'Y': [1, 0, 1]},
    ...     X=1,
    ...     Y=1
    ... )
    [True, False, False]
"""
# base index is a boolean vector, everywhere true
first_array = dict_of_arrays[list(dict_of_arrays.keys())[0]]
index = np.ones_like(first_array, dtype=np.bool_)
for var_name, var_val in fixed_vars.items():
index = index & (np.asarray(dict_of_arrays[var_name])==var_val)
return index
def estProbTable(data, var_name, parent_names, outcomeSpace):
"""
Calculate a dictionary probability table by ML given
`data`, a dictionary or dataframe of observations
`var_name`, the column of the data to be used for the conditioned variable and
`var_outcomes`, a tuple of possible outcomes for the conditiona varible and
`parent_names`, a tuple of columns to be used for the parents and
`parent_outcomes` a tuple of all possible parent outcomes
Return a dictionary containing an estimated conditional probability table.
"""
var_outcomes = outcomeSpace[var_name]
parent_outcomes = [outcomeSpace[var] for var in (parent_names)]
# cartesian product to generate a table of all possible outcomes
all_parent_combinations = product(*parent_outcomes)
prob_table = odict()
for i, parent_combination in enumerate(all_parent_combinations):
cond_array = []
parent_vars = dict(zip(parent_names, parent_combination))
parent_index = allEqualThisIndex(data, **parent_vars)
for var_outcome in var_outcomes:
var_index = (np.asarray(data[var_name])==var_outcome)
prob_table[tuple(list(parent_combination)+[var_outcome])] = (var_index & parent_index).sum()/parent_index.sum()
return {'dom': tuple(list(parent_names)+[var_name]), 'table': prob_table}
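A quick sanity check of estProbTable on a tiny hand-made dataset (the toy variables A and B and their values are hypothetical); the learned table for B given A should simply reproduce the empirical frequencies.
# Toy data: two binary variables, four observations (hypothetical values)
toyData = {'A': ['yes', 'yes', 'no', 'no'], 'B': ['yes', 'no', 'no', 'no']}
toyOutcomeSpace = {'A': ['yes', 'no'], 'B': ['yes', 'no']}
printFactor(estProbTable(toyData, 'B', ['A'], toyOutcomeSpace))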
def learn_bayes_net(G, file, outcomeSpace, prob_tables):
with open(file) as f:
#load data
data = pd.read_csv(f)
data.head()
#load domains for each feature
for col in data.columns:
domain = data[col].unique()
outcomeSpace[col] = list(domain)
#estimate probabilites for each node
#first reverse edge direction
graphT = transposeGraph(G)
#prob_tables = odict()
for node, parents in graphT.items():
prob_tables[node] = estProbTable(data,node,parents,outcomeSpace)
#testing
outcomeSpace = {}
prob_tables = {}
learn_bayes_net(breastCancerGraph, "bc 2.csv", outcomeSpace, prob_tables)
for f in prob_tables.keys():
print("*"*50)
buffer = int((50-(len(f)+2))/2)
print("*"*buffer + " " + f + " " + "*"*(50-buffer-len(f)-2))
print("*"*50)
printFactor(prob_tables[f])
print("\n")
Use forward sampling to generate 1000 samples from the Breast Cancer Bayesian Network. Comment on the time complexity of the procedure and the accuracy of the estimates. What happens as you add more observed variables to the query in terms of accuracy and effective sample size?
def generate_instances(G, GTopo, prob_tables, number_of_instances):
GT = transposeGraph(G)
    # GTopo = topologicalSort(G) is left out: generate_instance resolves parents recursively, so a precomputed topological order is not needed
instances = []
for instance in range (number_of_instances):
tempInstance = generate_instance(GT, prob_tables)
instances.append(tempInstance)
headers = [list(instances[0].keys())]
data = headers + [list(i.values()) for i in instances]
return data
def generate_instance(GT, prob_tables):
instance = {}
for node in GT:
generate_variable(node, prob_tables, GT, instance)
return instance
def generate_variable(node, prob_tables, GT, instance):
parents = GT[node]
#check if parents have been set
for parent in parents:
if parent not in instance:
generate_variable(parent, prob_tables, GT, instance)
if node not in instance:
generate_random_variable(parents, node, prob_tables, instance)
def generate_random_variable(parents, node, prob_tables, instance):
parentValues = [instance[val] for val in parents]
randomVal = random.uniform(0,1)
assignedVal = None
items = [list(key) +[item] for key, item in prob_tables[node]["table"].items()]
new_list = [i for i in items if i[:len(parentValues)] == parentValues]
#new_list.sort(key=lambda x: x[-1])
runningProb = 0
for x in new_list:
runningProb+=float(x[-1])
if randomVal < runningProb:
assignedVal = x[-2]
break
instance[node] = assignedVal
#Testing
#Compute a topological ordering of the nodes, starting the DFS from "Location"
breastCancerGraphTopo = topologicalSort(breastCancerGraph, "Location")
#generate 10000 instances and save result
generatedInstances = generate_instances(breastCancerGraph, breastCancerGraphTopo, prob_tables,10000)
with open('generated_instances.csv', 'w', newline='') as writeFile:
writer = csv.writer(writeFile)
writer.writerows(generatedInstances)
#test generated instances by using the learn_bayes_net function and comparing the generated prob_tables with the originals
outcomeSpace = {}
prob_tables = {}
learn_bayes_net(breastCancerGraph, "generated_instances.csv", outcomeSpace, prob_tables)
for f in prob_tables.keys():
print("*"*50)
buffer = int((50-(len(f)+2))/2)
print("*"*buffer + " " + f + " " + "*"*(50-buffer-len(f)-2))
print("*"*50)
printFactor(prob_tables[f])
print("\n")
Use the Bayesian Network to classify cases of the dataset. Propose an experimental setup to estimate the classification error. Compare the classification error of the Bayesian Network with your favourite Machine Learning classifier.
We used ensemble learning and reached ~90% accuracy; more information (source code) is provided at the bottom of this notebook.
def join(f1, f2, outcomeSpace):
"""
argument
`f1`, first factor to be joined.
`f2`, second factor to be joined.
`outcomeSpace`, dictionary with the domain of each variable
Returns a new factor with a join of f1 and f2
"""
if f1 == {}:
return f2
    # First, we need to determine the domain of the new factor. It will be the union of the domains of f1 and f2
# But it is important to eliminate the repetitions
common_vars = list(f1['dom']) + list(set(f2['dom']) - set(f1['dom']))
# We will build a table from scratch, starting with an empty list. Later on, we will transform the list into a odict
table = list()
    # Here is where the magic happens. The product iterator will generate all combinations of variable values
# as specified in outcomeSpace. Therefore, it will naturally respect observed values
for entries in product(*[outcomeSpace[node] for node in common_vars]):
# We need to map the entries to the domain of the factors f1 and f2
entryDict = dict(zip(common_vars, entries))
f1_entry = (entryDict[var] for var in f1['dom'])
f2_entry = (entryDict[var] for var in f2['dom'])
# Insert your code here
        p1 = prob(f1, *f1_entry) # Use the function prob to calculate the probability in factor f1 for entry f1_entry
        p2 = prob(f2, *f2_entry) # Use the function prob to calculate the probability in factor f2 for entry f2_entry
# Create a new table entry with the multiplication of p1 and p2
table.append((entries, p1 * p2))
return {'dom': tuple(common_vars), 'table': odict(table)}
def p_joint(outcomeSpace, cond_tables):#=cond_tables_ml):
"""
argument
`outcomeSpace`, dictionary with domain of each variable
`cond_tables`, conditional probability distributions estimated from data
Returns a new factor with full joint distribution
"""
p = {}
for table in cond_tables.keys():
p = join(p, cond_tables[table], outcomeSpace)
return p
def evidence(var, e, outcomeSpace):
"""
argument
`var`, a valid variable identifier.
`e`, the observed value for var.
`outcomeSpace`, dictionary with the domain of each variable
Returns dictionary with a copy of outcomeSpace with var = e
"""
newOutcomeSpace = outcomeSpace.copy() # Make a copy of outcomeSpace with a copy to method copy(). 1 line
newOutcomeSpace[var] = (e,) # Replace the domain of variable var with a tuple with a single element e. 1 line
return newOutcomeSpace
def marginalize(f, var, outcomeSpace):
"""
argument
`f`, factor to be marginalized.
`var`, variable to be summed out.
`outcomeSpace`, dictionary with the domain of each variable
Returns a new factor f' with dom(f') = dom(f) - {var}
"""
# Let's make a copy of f domain and convert it to a list. We need a list to be able to modify its elements
new_dom = list(f['dom'])
#########################
# Insert your code here #
#########################
new_dom.remove(var) # Remove var from the list new_dom by calling the method remove(). 1 line
table = list() # Create an empty list for table. We will fill in table from scratch. 1 line
for entries in product(*[outcomeSpace[node] for node in new_dom]):
s = 0; # Initialize the summation variable s. 1 line
# We need to iterate over all possible outcomes of the variable var
for val in outcomeSpace[var]:
# To modify the tuple entries, we will need to convert it to a list
entriesList = list(entries)
# We need to insert the value of var in the right position in entriesList
entriesList.insert(f['dom'].index(var), val)
#########################
# Insert your code here #
#########################
p = prob(f, *tuple(entriesList)) # Calculate the probability of factor f for entriesList. 1 line
s = s + p # Sum over all values of var by accumulating the sum in s. 1 line
        # Create a new table entry with the accumulated sum s
table.append((entries, s))
return {'dom': tuple(new_dom), 'table': odict(table)}
def normalize(f):
"""
argument
`f`, factor to be normalized.
Returns a new factor f' as a copy of f with entries that sum up to 1
"""
table = list()
sum = 0
for k, p in f['table'].items():
sum = sum + p
for k, p in f['table'].items():
table.append((k, p/sum))
return {'dom': f['dom'], 'table': odict(table)}
def query(p, outcomeSpace, q_vars, **q_evi):
"""
argument
`p`, probability table to query.
    `outcomeSpace`, dictionary with variable domains
`q_vars`, list of variables in query head
`q_evi`, dictionary of evidence in the form of variables names and values
    Returns a new NORMALIZED factor with all hidden variables eliminated and the evidence set as in q_evi
"""
# Let's make a copy of these structures, since we will reuse the variable names
pm = p.copy()
outSpace = outcomeSpace.copy()
# First, we set the evidence
for var_evi, e in q_evi.items():
outcomeSpace = evidence(var_evi, e, outcomeSpace)# Set the evidence var_evi = e. 2 lines
# Second, we eliminate hidden variables NOT in the query
for var in outcomeSpace:
if not var in q_vars:
pm = marginalize(pm,var,outcomeSpace)
# Marginalize to eliminate variable var. 3 lines
# Third, return a normalized factor with the query answer
return normalize(pm)
#hard coded to BC graph
def QueryOnDataFrameRow(row, p, outcomeSpace):
#hardcoded to BC
q = query( p,
outcomeSpace,
'BC',
Age=row["Age"],
Location=row["Location"],
BreastDensity=row["BreastDensity"],
Size=row["Size"],
Mass=row["Mass"],
#BC=row["BC"],
Metastasis=row["Metastasis"],
LymphNodes=row["LymphNodes"],
MC=row["MC"],
SkinRetract=row["SkinRetract"],
NippleDischarge=row["NippleDischarge"],
AD=row["AD"],
FibrTissueDev=row["FibrTissueDev"],
Spiculation=row["Spiculation"],
Margin=row["Margin"],
Shape=row["Shape"],
)
preds = [list(key) +[item] for key, item in q["table"].items()]
#preds = q["table"].items()
preds.sort(key=lambda x: x[-1])
y_pred = preds[-1][0]
confidence = preds[-1][1]
return y_pred, confidence
def PredictOnDataFrame(df,p,outcomeSpace):
preds = df.apply(lambda row: QueryOnDataFrameRow(row,p,outcomeSpace), axis=1)
print(preds)
df["y_pred"], df['confidence'] = zip(*preds)
return df
def PredictOnFile(file, outfile, p, outcomeSpace):
with open(file) as f:
data = pd.read_csv(f)
data = PredictOnDataFrame(data,p,outcomeSpace)
data.to_csv(outfile, index=False)
return data
# Testing
p = p_joint(outcomeSpace, prob_tables) #takes a long time
#Testing
print(query(p, outcomeSpace, 'BC', Shape= "Other"))
#Testing
predictionsDF = PredictOnFile("bc 2.csv", "bc 2 predictions.csv", p, outcomeSpace)
cm = metrics.confusion_matrix(predictionsDF["BC"],predictionsDF["y_pred"])
print(metrics.classification_report(predictionsDF["BC"],predictionsDF["y_pred"]))
#plt.matshow(cm)
#plt.title('Confusion matrix of the classifier')
#plt.colorbar()
#plt.show()
df = pd.DataFrame(predictionsDF, columns=['BC','y_pred'])
confusion_matrix = pd.crosstab(df['BC'], df['y_pred'], rownames=['Actual'], colnames=['Predicted'], margins = True)
sn.set(font_scale=1)
sn.heatmap(confusion_matrix,linewidths=2,)
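For a direct comparison with the scikit-learn classifiers evaluated below, the overall accuracy of the Bayesian Network classifier can also be printed (a small addition using the metrics module imported at the top of the notebook).
# Overall accuracy of the Bayesian Network classifier on the dataset
print("The accuracy of the Bayesian Network: {} %".format(metrics.accuracy_score(predictionsDF["BC"], predictionsDF["y_pred"])*100))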
Write a two-page report (around 1000 words) summarising your findings in this assignment. Some suggestions for the report are:
Answers are provided in the PDF report.
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
data = pd.read_csv("bc 2.csv")
data = data.reindex(sorted(data.columns), axis=1)
originalData = data.copy()
labels = data["BC"]
data.head()
originalData.describe()
#load domains for each feature
for col in data.columns:
domain = data[col].unique()
print(col,domain)
Age, BreastDensity and Size are ordinal.
AD, FibrTissueDev, LymphNodes, MC, Margin, Metastasis, NippleDischarge, SkinRetract and Spiculation are binary.
BC, Location, Mass and Shape need to be one-hot encoded (BC is the label, so it is removed from the features).
#one hot encoding
data = pd.get_dummies(data, columns = ['Location','Shape'])
data.head()
#ordinal
data.loc[data['Age'] == "<35", 'Age'] = 0
data.loc[data['Age'] == "35-49", 'Age'] = 1
data.loc[data['Age'] == "50-74", 'Age'] = 2
data.loc[data['Age'] == ">75", 'Age'] = 3
data.loc[data['BreastDensity'] == "low", 'BreastDensity'] = 0
data.loc[data['BreastDensity'] == "medium", 'BreastDensity'] = 1
data.loc[data['BreastDensity'] == "high", 'BreastDensity'] = 2
data.loc[data['Size'] == "<1cm", 'Size'] = 0
data.loc[data['Size'] == "1-3cm", 'Size'] = 1
data.loc[data['Size'] == ">3cm", 'Size'] = 2
#categorical boolean mask
categorical_feature_mask = data.dtypes==object
categorical_feature_mask
# filter categorical columns using mask and turn it into a list
categorical_cols = data.columns[categorical_feature_mask].tolist()
categorical_cols
# import labelencoder
from sklearn.preprocessing import LabelEncoder
#instantiate labelencoder object
le = LabelEncoder()
# apply le on categorical feature columns
data[categorical_cols] = data[categorical_cols].apply(lambda col: le.fit_transform(col))
data[categorical_cols].head(10)
#ensure original data has not changed
originalData.head()
data.head()
# Compute the correlation matrix
corr = data.corr()
# Generate a mask for the upper triangle
mask = np.zeros_like(corr, dtype=np.bool_)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
f, ax = plt.subplots(figsize=(11, 9))
# Generate a custom diverging colormap
cmap = sns.diverging_palette(220, 10, as_cmap=True)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmax=.3, center=0,
square=True, linewidths=.5, cbar_kws={"shrink": .5})
plt.show()
new_data = data.drop([
'BC',
], axis=1)
new_data.head()
from sklearn.model_selection import train_test_split
X_train,X_val,y_train,y_val = train_test_split(new_data,labels,test_size=0.33)
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler() #Instantiate the scaler
scaled_X_train = scaler.fit_transform(X_train) #Fit and transform the data
scaled_X_val = scaler.transform(X_val) #Transform the validation set with the scaler fitted on the training data
from sklearn.tree import DecisionTreeClassifier
clf_dt = DecisionTreeClassifier().fit(scaled_X_train,y_train)
y_pred_dt = clf_dt.predict(scaled_X_val)
from sklearn.naive_bayes import GaussianNB, BernoulliNB
clf_nb = BernoulliNB().fit(scaled_X_train,y_train)
y_pred_nb = clf_nb.predict(scaled_X_val)
from sklearn.svm import SVC
clf_svm = SVC(gamma=1, C=1000,probability=True).fit(scaled_X_train,y_train)
y_pred_svm = clf_svm.predict(scaled_X_val)
from sklearn.ensemble import RandomForestClassifier
clf_rf = RandomForestClassifier().fit(scaled_X_train,y_train)
y_pred_rf = clf_rf.predict(scaled_X_val)
from sklearn.ensemble import AdaBoostClassifier
clf_ada = AdaBoostClassifier(n_estimators=100, random_state =0).fit(scaled_X_train, y_train)
y_pred_ada = clf_ada.predict(scaled_X_val)
from sklearn.neural_network import MLPClassifier
clf_mlp = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(1000,), random_state=1).fit(scaled_X_train, y_train)
y_pred_mlp = clf_mlp.predict(scaled_X_val)
Ensemble (majority voting)
from sklearn.ensemble import VotingClassifier
clf_ensemble = VotingClassifier(estimators=[('dt', clf_dt), ('svm', clf_svm), ('rf', clf_rf), ('ada', clf_ada), ('mlp', clf_mlp)], voting='hard').fit(scaled_X_train,y_train)
y_pred_ensemble = clf_ensemble.predict(scaled_X_val)
from sklearn.metrics import accuracy_score
acc_rf = accuracy_score(y_val, y_pred_rf)
acc_svm = accuracy_score(y_val, y_pred_svm)
acc_dt = accuracy_score(y_val, y_pred_dt)
acc_nb = accuracy_score(y_val, y_pred_nb)
acc_ada = accuracy_score(y_val, y_pred_ada)
acc_mlp = accuracy_score(y_val, y_pred_mlp)
acc_ensemble = accuracy_score(y_val, y_pred_ensemble)
print("The accuracy of Decision Tree: {} %".format(acc_dt*100))
print("The accuracy of Bernoulli Naive Bayes: {} %".format(acc_nb*100))
print("The accuracy of SVM: {} %".format(acc_svm*100))
print("The accuracy of RF: {} %".format(acc_rf*100))
print("The accuracy of Ada Boost: {} %".format(acc_ada*100))
print("The accuracy of Multi-layer Perceptron: {} %".format(acc_mlp*100))
print("The accuracy of Ensemble Learning: {} %".format(acc_ensemble*100))
from sklearn.metrics import precision_score
prec_dt = precision_score(y_val,y_pred_dt,average='weighted')
prec_nb = precision_score(y_val,y_pred_nb,average='weighted')
prec_rf = precision_score(y_val,y_pred_rf,average='weighted')
prec_svm = precision_score(y_val,y_pred_svm,average='weighted')
prec_ada = precision_score(y_val,y_pred_ada,average='weighted')
prec_mlp = precision_score(y_val,y_pred_mlp,average='weighted')
prec_ensemble = precision_score(y_val,y_pred_ensemble,average='weighted')
print("The precision of Decision Tree: {} %".format(prec_dt*100))
print("The precision of Bernoulli Naive Bayes: {} %".format(prec_nb*100))
print("The precision of SVM: {} %".format(prec_svm*100))
print("The precision of Random Forest: {} %".format(prec_rf*100))
print("The precision of Ada Boost: {} %".format(prec_ada*100))
print("The precision of Multi-layer Perceptron: {} %".format(prec_mlp*100))
print("The precision of Ensemble Learning: {} %".format(prec_ensemble*100))
from sklearn.metrics import recall_score
recall_dt = recall_score(y_val,y_pred_dt,average='weighted')
recall_nb = recall_score(y_val,y_pred_nb,average='weighted')
recall_rf = recall_score(y_val,y_pred_rf,average='weighted')
recall_svm = recall_score(y_val,y_pred_svm,average='weighted')
recall_ada = recall_score(y_val,y_pred_ada,average='weighted')
recall_mlp = recall_score(y_val,y_pred_mlp,average='weighted')
recall_ensemble = recall_score(y_val,y_pred_ensemble,average='weighted')
print("The recall of Decision Tree: {} %".format(recall_dt*100))
print("The recall of Bernoulli Naive Bayes: {} %".format(recall_nb*100))
print("The recall of SVM: {} %".format(recall_svm*100))
print("The recall of Random Forest: {} %".format(recall_rf*100))
print("The recall of Ada Boost: {} %".format(recall_ada*100))
print("The recall of Multi-layer Perceptron: {} %".format(recall_mlp*100))
print("The recall of Ensemble Learning: {} %".format(recall_ensemble*100))
from sklearn.metrics import f1_score
f1_dt = f1_score(y_val,y_pred_dt,average='weighted')
f1_nb = f1_score(y_val,y_pred_nb,average='weighted')
f1_svm = f1_score(y_val,y_pred_svm,average='weighted')
f1_rf = f1_score(y_val,y_pred_rf,average='weighted')
f1_ada = f1_score(y_val,y_pred_ada,average='weighted')
f1_mlp = f1_score(y_val,y_pred_mlp,average='weighted')
f1_ensemble = f1_score(y_val,y_pred_ensemble,average='weighted')
print("The F1-score of Decision Tree: {} %".format(f1_dt*100))
print("The F1-score of Bernoulli Naive Bayes: {} %".format(f1_nb*100))
print("The F1-score of SVM: {} %".format(f1_svm*100))
print("The F1-score of Random Forest: {} %".format(f1_rf*100))
print("The F1-score of Ada Boost: {} %".format(f1_ada*100))
print("The F1-score of Multi-layer Perceptron: {} %".format(f1_mlp*100))
print("The F1-score of Ensemble Learning: {} %".format(f1_ensemble*100))