机器学习实战——第二章之改进约会网站的配对效果

三种类型:不喜欢的-1,魅力一般的-2,极具魅力的-3。

样本特征:每年获得的飞行常客里程数,玩视频游戏所耗时间百分比,每周消费的冰淇淋公升数。

 

 1 from numpy import *
 2 import matplotlib
 3 import matplotlib.pyplot as plt
 4 
 5 ‘‘‘
 6 输入文本文件名字符串,输出训练样本矩阵和类标签向量
 7 ‘‘‘
 8 def file2matrix(filename):
 9     fr = open(filename)
10     arrayOLines = fr.readlines()  #一次读取整个文件,将文件内容分析成一个行的列表
11     numberOfLines = len(arrayOLines)
12     returnMat = zeros((numberOfLines, 3))
13     classLabelVector = []
14     index = 0
15     for line in arrayOLines:
16         line = line.strip()        #截取掉所有的回车字符
17         listFromLine = line.split(\t)                #将line分割成一个元素列表
18         returnMat[index, :] = listFromLine[0:3]        #选取前3个元素存储到特征矩阵中
19         classLabelVector.append(int(listFromLine[-1]))        #选取最后一个元素进行存储
20         index += 1
21     return returnMat, classLabelVector
22 
if __name__ == '__main__':
    # Plot the first two feature columns of the raw data; marker size and
    # colour are both scaled by the class label so the classes are visible.
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(datingDataMat[:, 0], datingDataMat[:, 1],
               20.0 * array(datingLabels), 15.0 * array(datingLabels))
    plt.show()

 技术分享

 


>>> import numpy as np
>>> np.zeros((3,2))  #3行2列的零矩阵
array([[ 0.,  0.],
       [ 0.,  0.],
       [ 0.,  0.]])
ax.scatter(datingDataMat[:,0],datingDataMat[:,1], 20.0*array(datingLabels), 15.0*array(datingLabels))#scatter(x,y,大小,颜色)

 

'''
归一化特征值
newValue = (oldValue - min) / (max - min)
'''
def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet``.

    Applies ``newValue = (oldValue - min) / (max - min)`` column by column.

    Args:
        dataSet: 2-D numeric array; the minimum and maximum are taken along
            axis 0, i.e. per column.

    Returns:
        A tuple ``(normDataSet, ranges, minVals)``: the scaled array (same
        shape as ``dataSet``), the per-column ``max - min`` and the
        per-column minimum.
    """
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # A constant column has range 0; divide it by 1 instead so it maps to 0
    # rather than nan (0/0). The returned ``ranges`` keeps the true values.
    safeRanges = where(ranges == 0, 1, ranges)
    # Broadcasting stretches the per-column vectors over every row, so no
    # explicit tile() copies are needed.
    normDataSet = (dataSet - minVals) / safeRanges
    return normDataSet, ranges, minVals

if __name__ == '__main__':
    # Same scatter plot as for the raw data, but on the min-max normalised
    # features; marker size and colour are scaled by the class label.
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.scatter(normMat[:, 0], normMat[:, 1],
               30.0 * array(datingLabels), 100.0 * array(datingLabels))
    plt.xlabel(u'每年获得的飞行常客里程数')
    plt.ylabel(u'玩视频游戏所耗时间百分比')
    plt.show()

技术分享


 1 ‘‘‘
 2 测试代码。
 3 评估算法的正确率:提供已有样本的90%作为训练样本,而使用其余的10%数据去测试分类器。
 4 错误率 = 错误结果的次数 / 测试数据的总数
 5 ‘‘‘
 6 def datingClassTest():
 7     hoRatio = 0.10
 8     datingDataMat, datingLabels = file2matrix(datingTestSet2.txt)
 9     normMat, ranges, minVals = autoNorm(datingDataMat)
10     m = normMat.shape[0]
11     numTestVecs = int(m * hoRatio)        #10%的样本数用于测试
12     errorCount = 0    
13     for i in range(numTestVecs):
14         classifierResult = classify0(normMat[i,:], normMat[numTestVecs:m,:], 15             datingLabels[numTestVecs:m], 20)
16         print "the classifier came back with: %d, the real answer is: %d" 17             % (classifierResult, datingLabels[i])
18         if (classifierResult != datingLabels[i]): errorCount += 1.0
19     print "the total error rate is: %f" % (errorCount / float(numTestVecs))  #输出错误率
20 
if __name__ == '__main__':
    # Run the hold-out evaluation when the file is executed as a script.
    datingClassTest()

 

 1 ‘‘‘
 2 输入某个人的信息,给出对对方喜欢程度的预测值
 3 ‘‘‘
 4 def classifyPerson():
 5     resultList = [not at all, in small doses, in large doses]
 6     percentTats = float(raw_input("percentage of time spend playing video games?"))
 7     ffMiles = float(raw_input("frequent flier miles earned per year?"))
 8     iceCream = float(raw_input("liters of ice cream consumed per year?"))
 9     datingDataMat, datingLabels = file2matrix(datingTestSet2.txt)
10     normMat, ranges, minVals = autoNorm(datingDataMat)
11     inArr = array([ffMiles, percentTats, iceCream])
12     classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
13     print "You will probably like this person: ", resultList[classifierResult - 1]
14 
if __name__ == '__main__':
    # Start the interactive prediction when the file is executed as a script.
    classifyPerson()

测试:

1 percentage of time spend playing video games?10
2 frequent flier miles earned per year?10000
3 liters of ice cream consumed per year?0.5
4 You will probably like this person:  in small doses

 

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。