"""Measure how stable RFE feature selection is across cross-validation folds.

The script loads a tabular dataset with a ``LABEL`` column, runs recursive
feature elimination (RFE) with a linear SVR on the training part of each of
10 stratified folds, and finally prints, for every feature, in how many folds
it was among the 20 features kept by RFE.
"""
from collections import Counter

from sklearn import datasets
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.preprocessing import normalize
from sklearn.metrics import accuracy_score
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFE

# Location of the input dataset (machine-specific absolute path).
DATA_PATH = r'C:\Users\Utente\anaconda3\Lib\site-packages\pandas\io\data_covnet_score-imputed_missRF_increasing_1.txt'

# The run is slow, so three numbers are printed per fold to show progress:
# FOLD_STARTED_MARKER + k when fold k starts, FIT_DONE_MARKER + k when its
# RFE fit has finished, and k itself once its selected features are recorded.
FOLD_STARTED_MARKER = 40000
FIT_DONE_MARKER = 100000

# Load the dataset.
data = pd.read_csv(DATA_PATH)

# Names of the features kept by RFE, accumulated over all folds (a feature
# appears once for every fold in which it was selected).
parzial_features = []

# Every column except the target column 'LABEL' is a candidate feature.
features = [f for f in data.columns if f != 'LABEL']
X = data[features].values
y = data['LABEL'].values.ravel()

# Zero-filled array shaped like y; placeholder for per-fold predictions
# (no prediction step is implemented in this script).
y_pred = np.zeros_like(y)

# Estimator driving the elimination: RFE ranks features by the coefficients
# of a linear-kernel SVR and drops one feature per iteration until 20 remain.
# NOTE(review): LABEL is treated as a class label by StratifiedKFold while the
# estimator is a regressor - confirm SVR (rather than a classifier) is intended.
estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=20, step=1)

# Splits the samples into 10 folds that preserve the class proportions of y.
skf = StratifiedKFold(n_splits=10)

for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
    print(FOLD_STARTED_MARKER + fold)

    X_train, X_test = X[train_index, :], X[test_index, :]
    y_train, y_test = y[train_index], y[test_index]

    # Fit on the training part of this fold only. Fitting on the full (X, y)
    # would select the same features in every fold and make the per-feature
    # counts below meaningless (each count would be either 0 or 10).
    selector = selector.fit(X_train, y_train)
    print(FIT_DONE_MARKER + fold)

    # selector.support_ is a boolean mask aligned with `features`.
    parzial_features.extend(
        name for name, kept in zip(features, selector.support_) if kept
    )
    print(fold)

# Report, in column order, how many folds selected each feature.
print()
selection_counts = Counter(parzial_features)
for name in features:
    print(name, ' ', selection_counts[name])