diff --git a/Spam Classification.ipynb b/Spam Classification.ipynb
new file mode 100644
index 0000000..09c79f9
--- /dev/null
+++ b/Spam Classification.ipynb
@@ -0,0 +1,103 @@
+{
+ "metadata": {
+  "name": "",
+  "signature": "sha256:bd2eaf7eb0bd39236257fcccb3eac699669f9a12530cdc65325c8a1f7dfaede9"
+ },
+ "nbformat": 3,
+ "nbformat_minor": 0,
+ "worksheets": [
+  {
+   "cells": [
+    {
+     "cell_type": "markdown",
+     "metadata": {},
+     "source": [
+      "# Spam Classification\n",
+      "\n",
+      "Fits a depth-limited decision tree to `spambase.data`, scores it on a 40% hold-out split, and then runs 5-fold cross-validation.\n",
+      "\n",
+      "`spambase.data` must be in the working directory. It is read as a header-less CSV whose last column is the 0/1 spam label."
+     ]
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "import pandas as pd\n",
+      "from sklearn import metrics\n",
+      "from sklearn.tree import DecisionTreeClassifier\n",
+      "\n",
+      "try:\n",
+      "    from sklearn.model_selection import cross_val_score, train_test_split\n",
+      "except ImportError:  # scikit-learn < 0.18 has no model_selection module\n",
+      "    from sklearn.cross_validation import cross_val_score, train_test_split"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# The file has no header row: header=None keeps the first record as data\n",
+      "# instead of turning it into column names.\n",
+      "spam_base = pd.read_csv(\"spambase.data\", header=None)\n",
+      "\n",
+      "# Split off the label (last column) so spam_base holds features only.\n",
+      "# Loading and popping live in one cell so re-running it stays idempotent.\n",
+      "spam_target = spam_base.pop(spam_base.columns[-1])"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# Class balance of the label column (a summary instead of dumping every row).\n",
+      "spam_target.value_counts()"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "X_train, X_test, y_train, y_test = train_test_split(\n",
+      "    spam_base, spam_target, test_size=0.4, random_state=0)\n",
+      "\n",
+      "# random_state is fixed so the fitted tree is the same on every run.\n",
+      "classifier = DecisionTreeClassifier(max_depth=9, random_state=0)\n",
+      "classifier = classifier.fit(X_train, y_train)\n",
+      "predicted = classifier.predict(X_test)\n",
+      "\n",
+      "print(metrics.classification_report(y_test, predicted))\n",
+      "print(metrics.confusion_matrix(y_test, predicted))\n",
+      "print(metrics.f1_score(y_test, predicted))"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    },
+    {
+     "cell_type": "code",
+     "collapsed": false,
+     "input": [
+      "# 5-fold cross-validation on the full data set; show per-fold scores and their mean.\n",
+      "scores = cross_val_score(classifier, spam_base, spam_target, cv=5)\n",
+      "print(scores)\n",
+      "print(scores.mean())"
+     ],
+     "language": "python",
+     "metadata": {},
+     "outputs": []
+    }
+   ],
+   "metadata": {}
+  }
+ ]
+}
\ No newline at end of file