-
Notifications
You must be signed in to change notification settings - Fork 20
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
4 changed files
with
194 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,90 @@ | ||
import pandas as pd | ||
import numpy as np | ||
|
||
import rf_model | ||
|
||
|
||
class feature_extractor:
    """Turn one URL into the single-row feature table consumed by ``rf_model``.

    Every feature method returns a small integer code.
    NOTE(review): the codes look like 0 = legitimate, 1 = phishing and
    2 = suspicious, but nothing in this file states that mapping -- confirm
    against the code that trained the model.
    """

    # Column order matters: rf_model.predictor selects the feature columns
    # by position (everything after the first three descriptive columns).
    _COLUMNS = (
        'protocol',
        'domain_name',
        'address',
        'long_url',
        'having_@_symbol',
        'redirection_//_symbol',
        'prefix_suffix_seperation',
        'sub_domains',
    )

    def __init__(self, url: str):
        """Remember the URL that ``extract`` will analyse."""
        self.input_url = url

    def long_url(self, l):
        """Bucket a URL by length: 0 if < 54 chars, 2 if 54-75, 1 if longer."""
        length = len(str(l))
        if length < 54:
            return 0
        if length <= 75:
            return 2
        return 1

    def have_at_symbol(self, l):
        """Return 1 if the text contains an ``@`` symbol, else 0."""
        return 1 if "@" in str(l) else 0

    def redirection(self, l):
        """Return 1 if the text contains ``//``, else 0.

        ``extract`` passes the part of the URL that follows the protocol, so
        a hit means a second ``//`` appears after the scheme separator.
        """
        return 1 if "//" in str(l) else 0

    def prefix_suffix_seperation(self, l):
        """Return 1 if the text (the domain name) contains ``-``, else 0."""
        return 1 if '-' in str(l) else 0

    def sub_domains(self, l):
        """Bucket a domain by dot count: 0 if < 3 dots, 2 if exactly 3, 1 if more."""
        dots = str(l).count('.')
        if dots < 3:
            return 0
        if dots == 3:
            return 2
        return 1

    def _build_feature_frame(self):
        """Split the stored URL and return its one-row feature DataFrame.

        Unlike the earlier pandas ``str.split`` pipeline this does not fail
        when the URL has no ``://`` or no path: the missing ``protocol`` or
        ``address`` is stored as ``None`` instead.
        """
        url = str(self.input_url)

        # Only the text between the first and second "://" is analysed as
        # the remainder; this mirrors the original column-1 selection so the
        # feature values stay the same for URLs that already worked.
        # NOTE(review): presumably the model was trained with the same rule.
        parts = url.split("://")
        if len(parts) > 1:
            protocol, remainder = parts[0], parts[1]
        else:
            protocol, remainder = None, parts[0]

        domain_name, slash, address = remainder.partition("/")
        if not slash:
            address = None

        row = {
            'protocol': protocol,
            'domain_name': domain_name,
            'address': address,
            'long_url': self.long_url(url),
            'having_@_symbol': self.have_at_symbol(url),
            'redirection_//_symbol': self.redirection(remainder),
            'prefix_suffix_seperation': self.prefix_suffix_seperation(domain_name),
            'sub_domains': self.sub_domains(domain_name),
        }
        return pd.DataFrame([row], columns=list(self._COLUMNS))

    def extract(self):
        """Build the feature row and return ``rf_model.predictor``'s result.

        Returns:
            Whatever ``rf_model.predictor`` returns for the feature table
            (the caller in the UI unpacks it into two strings).
        """
        return rf_model.predictor(self._build_feature_frame())
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,69 @@ | ||
from PyQt5 import QtCore, QtGui, QtWidgets | ||
import feature_extractor | ||
|
||
class Ui_Spam_detector(object):
    """Main-window layout and click handling for the spam URL detector."""

    def setupUi(self, Spam_detector):
        """Create the widgets on ``Spam_detector`` and wire up the button.

        Args:
            Spam_detector: the main window that receives the central widget
                and the status bar.
        """
        Spam_detector.setObjectName("Spam_detector")
        Spam_detector.resize(521, 389)
        self.centralwidget = QtWidgets.QWidget(Spam_detector)
        self.centralwidget.setObjectName("centralwidget")

        # Check button, connected to the button_click handler.
        self.check_button = QtWidgets.QPushButton(self.centralwidget)
        self.check_button.setGeometry(QtCore.QRect(210, 170, 93, 28))
        self.check_button.setObjectName("check_button")
        self.check_button.clicked.connect(self.button_click)

        # URL input section.
        self.url_input = QtWidgets.QLineEdit(self.centralwidget)
        self.url_input.setGeometry(QtCore.QRect(70, 111, 431, 31))
        self.url_input.setObjectName("url_input")

        self.label = QtWidgets.QLabel(self.centralwidget)
        self.label.setGeometry(QtCore.QRect(20, 110, 81, 31))
        self.label.setObjectName("label")

        # Output message area; results are appended here on every check.
        self.output_text = QtWidgets.QTextEdit(self.centralwidget)
        self.output_text.setGeometry(QtCore.QRect(30, 241, 461, 121))
        self.output_text.setObjectName("output_text")

        self.label_2 = QtWidgets.QLabel(self.centralwidget)
        self.label_2.setGeometry(QtCore.QRect(110, 10, 311, 41))
        self.label_2.setObjectName("label_2")

        Spam_detector.setCentralWidget(self.centralwidget)
        self.statusbar = QtWidgets.QStatusBar(Spam_detector)
        self.statusbar.setObjectName("statusbar")
        Spam_detector.setStatusBar(self.statusbar)

        self.retranslateUi(Spam_detector)
        QtCore.QMetaObject.connectSlotsByName(Spam_detector)

    def retranslateUi(self, Spam_detector):
        """Set the window title and the visible text of the button and labels."""
        _translate = QtCore.QCoreApplication.translate
        Spam_detector.setWindowTitle(_translate("Spam_detector", "MainWindow"))
        self.check_button.setText(_translate("Spam_detector", "Check "))
        self.label.setText(_translate("Spam_detector", "<html><head/><body><p><span style=\" font-size:10pt;\">URL :</span></p></body></html>"))
        self.label_2.setText(_translate("Spam_detector", "<html><head/><body><p align=\"center\"><span style=\" font-size:16pt;\">Spam URL Detector</span></p></body></html>"))

    def button_click(self):
        """Classify the URL typed into the input box and append the verdict.

        Empty input and failures inside the extractor/model are reported in
        the output box instead of being allowed to escape the slot.
        """
        text = self.url_input.text().strip()
        if not text:
            self.output_text.append("Please enter a URL to check.\n\n")
            return

        try:
            obj = feature_extractor.feature_extractor(text)
            str1, str2 = obj.extract()
        except Exception as err:
            # UI boundary: an exception escaping a Qt slot can abort the
            # whole application under PyQt5, so show it to the user instead.
            self.output_text.append(
                "Could not check this URL: {}\n\n".format(err)
            )
            return

        self.output_text.append("{} \n{}\n\n".format(str1, str2))
|
||
|
||
#def show_output(): | ||
|
||
if __name__ == "__main__":
    import sys

    # Start the Qt application, build the main window from the generated
    # UI class, show it, and hand the event loop's return code to the OS.
    app = QtWidgets.QApplication(sys.argv)
    main_window = QtWidgets.QMainWindow()
    ui = Ui_Spam_detector()
    ui.setupUi(main_window)
    main_window.show()
    exit_code = app.exec_()
    sys.exit(exit_code)
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,35 @@ | ||
import pickle | ||
import numpy,sklearn,pandas | ||
|
||
"""# save the model to disk | ||
filename = 'finalized_model.sav' | ||
pickle.dump(clf, open(filename, 'wb')) | ||
""" | ||
|
||
def describe_prediction(preds, score):
    """Format the first prediction and its class-1 probability as two strings.

    Args:
        preds: indexable of predicted labels; only ``preds[0]`` is read.
        score: indexable of per-class probability rows; only ``score[0][1]``
            is read.

    Returns:
        ``(str1, str2)``: the spoofed yes/no line and the confidence line.
    """
    # Label 0 is reported as "spoofed"; any other label as not spoofed.
    if preds[0] == 0:
        str1 = "Spoofed webpage: Yes"
    else:
        str1 = "Spoofed webpage: NO"

    # NOTE(review): this is always the probability of the model's second
    # class, whichever class was predicted -- confirm that is the intended
    # meaning of "confidence".
    str2 = "Confidence score: " + str(score[0][1])
    return str1, str2


def predictor(splitted_data):
    """Load the saved model and classify the feature row(s) it is given.

    Args:
        splitted_data: DataFrame whose columns at positions 3 to 8 are the
            model features (the first three columns are skipped).

    Returns:
        ``(str1, str2)`` describing the first row's prediction; see
        ``describe_prediction``.

    Raises:
        FileNotFoundError: if ``finalized_model.sav`` is not in the current
            working directory.
    """
    # load the model from disk
    filename = 'finalized_model.sav'
    # SECURITY: pickle.load can execute arbitrary code contained in the
    # file; only ship and load a model file from a trusted source.
    with open(filename, 'rb') as model_file:
        loaded_model = pickle.load(model_file)

    feature_columns = splitted_data.columns[3:9]
    features = splitted_data[feature_columns]

    preds = loaded_model.predict(features)
    score = loaded_model.predict_proba(features)
    return describe_prediction(preds, score)
|
||
|
||
|
||
|
||
|
||
|
||
|