KNN算法实现:
提取文本:
import numpy as np
# 提取文本:从逗号分隔的文本文件中读取特征和标签
def loadDataSet(fileName):
    """Load a comma-separated numeric data file into features and labels.

    Each non-blank line is one sample. The column count is taken from the
    first non-blank line; all columns but the last are features and the last
    column of every line is its label. Blank lines are skipped.

    Args:
        fileName: Path of the text file to read.

    Returns:
        A tuple ``(xMat, labelMat)`` where ``xMat`` is an ``np.matrix`` with
        one row of float features per sample and ``labelMat`` is a list of
        float labels in file order.

    Raises:
        ValueError: If a field cannot be converted to ``float``.
        IndexError: If a line has fewer columns than the first line.
    """
    # Read the file once, inside ``with``, so the handle is always closed.
    with open(fileName) as fr:
        stripped = [line.strip() for line in fr]
    # A trailing empty line is common in data files; it carries no sample.
    rows = [row for row in stripped if row]

    # An empty file yields no features and no labels.
    numFeat = len(rows[0].split(',')) if rows else 1

    dataMat = []
    labelMat = []
    for row in rows:
        curLine = row.split(',')
        dataMat.append([float(curLine[i]) for i in range(numFeat - 1)])
        labelMat.append(float(curLine[-1]))

    # np.asmatrix is the function np.mat used to alias; it is available on
    # both old and current NumPy releases.
    return np.asmatrix(dataMat), labelMat
# 训练样本标准化(min-max 归一化,将每个特征缩放到 [0, 1])
def autoNorm(dataSet):
    """Min-max normalise every feature column of a 2-D data set to [0, 1].

    Each column is transformed as ``(value - column_min) / (column_max -
    column_min)``. A column whose values are all equal has a zero range; it
    is mapped to 0 instead of producing NaN from a 0/0 division.

    Args:
        dataSet: 2-D ``np.ndarray`` or ``np.matrix`` with one sample per row
            and one feature per column.

    Returns:
        The normalised data, with the same shape as ``dataSet``.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    # Named ``ranges`` so the builtin ``range`` is not shadowed.
    ranges = maxVals - minVals
    # Divide constant columns by 1: their numerator is already 0.
    ranges = np.where(ranges == 0, 1, ranges)
    # Broadcasting applies the per-column min/range to every row.
    return (dataSet - minVals) / ranges
# 将总样本分为训练样本和测试样本
def classifyDataSet(normDataSet, labelMat, testInterval=50):
    """Split samples and labels into a training set and a held-out test set.

    Samples whose index is a multiple of ``testInterval`` (0, testInterval,
    2 * testInterval, ...) go to the test set; all others go to the training
    set.

    Args:
        normDataSet: 2-D array-like (``np.ndarray``, ``np.matrix`` or nested
            list) with one sample per row.
        labelMat: Sequence of labels, indexable by sample position.
        testInterval: Every ``testInterval``-th sample is held out for
            testing. Defaults to 50.

    Returns:
        A tuple ``(trainData, trainLabels, testData, testLabels)``. The data
        entries are lists of 1-D ``np.ndarray`` rows; the label entries are
        lists of the corresponding items of ``labelMat``.

    Raises:
        ValueError: If ``testInterval`` is smaller than 1.
        IndexError: If ``labelMat`` has fewer entries than there are samples.
    """
    if testInterval < 1:
        raise ValueError("testInterval must be a positive integer")

    # np.array copies the input and turns a matrix into a plain ndarray, so
    # each row below is an independent 1-D array.
    samples = np.array(normDataSet)

    trainData = []
    trainLabels = []
    testData = []
    testLabels = []
    for j, row in enumerate(samples):
        if j % testInterval == 0:
            testData.append(row)
            testLabels.append(labelMat[j])
        else:
            trainData.append(row)
            trainLabels.append(labelMat[j])
    return trainData, trainLabels, testData, testLabels
KNN
注意:这里传入的 inX 和 dataSet 是两个数组(ndarray),不是矩阵(matrix)
import numpy as np
import operator as op
def classify(inX, dataSet, labels, k=7):
    """Classify one sample by majority vote among its k nearest neighbours.

    Distances are Euclidean. If ``k`` exceeds the number of training
    samples, all training samples vote.

    Args:
        inX: Feature vector of the sample to classify (1-D array-like, or a
            single-row 2-D array/matrix).
        dataSet: Training samples, 2-D array-like with one sample per row.
        labels: Sequence of hashable labels; ``labels[i]`` belongs to row
            ``i`` of ``dataSet``.
        k: Number of nearest neighbours that vote. Defaults to 7.

    Returns:
        The label with the most votes among the nearest neighbours.

    Raises:
        IndexError: If no neighbour can vote (``k`` < 1 or empty
            ``dataSet``).
    """
    # Work on plain ndarrays: on np.matrix, ``**`` is a matrix power rather
    # than an element-wise square.
    dataSet = np.asarray(dataSet)
    inX = np.asarray(inX)
    dataSetSize = dataSet.shape[0]

    # Broadcasting subtracts inX from every training row.
    diffMat = dataSet - inX
    distances = (diffMat ** 2).sum(axis=1) ** 0.5
    sortedDistIndices = distances.argsort()

    classCount = {}
    # Never ask for more neighbours than there are training samples.
    for i in range(min(k, dataSetSize)):
        voteLabel = labels[sortedDistIndices[i]]
        classCount[voteLabel] = classCount.get(voteLabel, 0) + 1

    # ``items()`` exists on both Python 2 and 3 (``iteritems()`` is 2-only).
    sortedResult = sorted(classCount.items(), key=op.itemgetter(1), reverse=True)
    return sortedResult[0][0]
main函数
import KNN
import fileOp
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
# Demo script: load 'donate.txt', normalise it, classify the held-out test
# samples with KNN (k=5), print predictions next to the true labels, and
# plot two of the normalised feature columns.

dataMat, labelsMat = fileOp.loadDataSet('donate.txt')
normDataSet = fileOp.autoNorm(dataMat)

# NOTE(review): fileOp.classifyDataSet is expected to return
# (training data, training labels, test data, test labels) - confirm against
# the fileOp module.
classfiDataSet, classfiResultSet, testDataSet, testResultSet = fileOp.classifyDataSet(normDataSet, labelsMat)
testDataSet = np.array(testDataSet)
classfiDataSet = np.array(classfiDataSet)

# Predict a label for every test sample from the training samples.
result = []
for i in range(testDataSet.shape[0]):
    result.append(KNN.classify(testDataSet[i, :], classfiDataSet, classfiResultSet, 5))

# Function-call form of print works on both Python 2 and Python 3.
print(result)
print(testResultSet)

# Plot feature columns 2 and 3 of all samples; marker size and colour both
# encode the label. Convert to a plain ndarray first so the column slices
# are 1-D (slicing an np.matrix would give (m, 1) matrices).
fig = plt.figure()
ax = fig.add_subplot(111)
plotData = np.asarray(normDataSet)
labelScale = 15.0 * (np.array(labelsMat) + 1)
ax.scatter(plotData[:, 2], plotData[:, 3], labelScale, labelScale)
plt.show()
注意:
序列(如 list)可以增加或删除元素,但没有 shape 操作
数组有 shape、转置等操作,是基于某个轴进行操作的。数组有切片功能,一般用数组操作即可,矩阵用于运算。
np.dot(arr.T,arr)可以用于计算内积
numpy
array和matrix之间的区别:参考http://www.aichengxu.com/view/12902