机器学习实战(一) K-近邻

K近邻算法概述

简单地说,k近邻算法就是采用测量不同特征值之间距离的方法进行分类.
通过计算新数据与已知类别数据集中各个样本之间的距离,来判定新数据的类别.

这里我们选择使用欧氏距离来当做两点间的距离.

实现KNN算法

伪码

对未知类别属性的数据集中的每个点依次执行以下操作

计算已知类别数据集中的每个点与当前点之间的距离

按照距离递增次序排序

选取与当前点距离最小的k个点

确定前k个点所在的类别的出现频率

返回前k个点出现频率最高的类别作为当前点的预测分类

实现算法前

我们来学习一下需要用到的一些库函数.

numpy

1.list转array

from numpy import *
array([1,1])

2.zeros()初始化向量

import numpy
from numpy import *
a=(3,4)
zeros(a)
# 初始化一个3行四列的0矩阵

3.矩阵操作

import numpy
from numpy import *

Mat = array([[1,2],[3,4]])

# Minimum of each column (axis=0 collapses the rows) -> array([1, 2])
Mat.min(0)
# Minimum of each row (axis=1 collapses the columns) -> array([1, 3])
Mat.min(1)
# Sum of each column -> array([4, 6])
Mat.sum(0)
# The argument above is the axis that gets collapsed: 0 yields one result per column, 1 yields one result per row

# shape returns a tuple: (number of rows, number of columns)
Mat.shape

3.1矩阵排序argsort()

import numpy
from numpy import *

k = array([1,2,8.5,-1,0])
t = k.argsort()
# 输出升序排序后每位数字的下标数组

输出升序排序后每位数字的下标数组,比如上面那个输出是:

array([3,4,0,1,2],dtype=int64)
# 第一个是k[3],第二个是k[4]

4.tile

import numpy
from numpy import *

# 有两个参数,第一个参数是初始矩阵,第二个参数是一个tuple,代表
# 向行拓展次数,以及向列拓展次数,具体调用一下就知道了
tile([1,2],(1))# 原矩阵
tile([1,2],(2,2))# 行两倍,列两倍

5.运算
直接使用运算符号(如+、-、*、/、**),相当于对数组中对应位置的元素逐个进行运算(逐元素运算).
真正的矩阵运算(如矩阵乘法)需要通过库函数来实现(如dot()).

数据读取

与本例相关的数据集地址:
datingTestSet2.txt

# Open the data file and read every line; the with-block closes the handle
# as soon as reading is done instead of leaving it open
with open('datingTestSet2.txt') as fr:
    arrayOfLines = fr.readlines()
arrayOfLines

from numpy import *
numberOfLines = len(arrayOfLines)
# Allocate a zero matrix with one row per line of the file and 3 feature columns
returnMat = zeros((numberOfLines,3))
returnMat
# Walk the lines and show how each one splits on tabs;
# the last field is the integer class label
for line in arrayOfLines:
    line = line.strip()
    fields = line.split('\t')
    print(fields)
    print(int(fields[-1]))

matplotlib散点图

import matplotlib
import matplotlib.pyplot as plt
import numpy
from numpy import *
# Create a new figure
fig = plt.figure()
# 111 means "1x1 grid of subplots, select the 1st one" (it is not a limit on the number of points)
ax = fig.add_subplot(111)
# Build a small matrix; the third column holds the class label
Mat = array([[1,123,2],[10,256,1],[7,321,3]])
# Extract the label column
Label = Mat[:,2]
# scatter(x, y, s, c): x coordinates, y coordinates, marker sizes, marker colours;
# size and colour are both scaled from the label so each class looks different
ax.scatter(Mat[:,0],Mat[:,1],15.0*Label,15.0*Label)
# Display the figure
plt.show()

代码实现

对于代码的解释我都注释在代码中了

# K-近邻
'''
算法思想:

计算已知类别数据集中的每个点与当前点之间的距离

按照距离递增次序排序

选取与当前点距离最小的k个点

确定前k个点所在的类别的出现频率

返回前k个点出现频率最高的类别作为当前点的预测分类
'''

def classify0(inX,dataSet,labels,k):
    '''
    Classify one sample with the k-nearest-neighbours rule.

    inX: feature vector to classify - array-like of length n_features
    dataSet: training samples, one sample per row - array (n_samples, n_features)
    labels: class label of each training row - sequence of length n_samples
    k: number of nearest neighbours that vote - int, 1 <= k <= n_samples

    Returns the label occurring most often among the k training rows that
    are closest to inX in Euclidean distance. When several labels tie, the
    one whose first vote came from the nearer neighbour wins.

    Raises ValueError if k < 1 and IndexError if k exceeds the number of
    training rows.
    '''
    if k < 1:
        raise ValueError('k must be a positive integer')
    # Broadcasting subtracts inX from every row of dataSet; the sign of the
    # difference is irrelevant because it is squared next
    diffMat = dataSet - inX
    sqDiffMat = diffMat**2
    # Sum across the features of each row (axis=1) to get ONE squared
    # distance per training sample; summing over axis 0 would instead
    # yield one value per feature and break the neighbour lookup
    sqDistance = sqDiffMat.sum(axis=1)
    distance = sqDistance**0.5
    # Indices of the training rows ordered from nearest to farthest
    sortedDistIndicies = distance.argsort()

    # Count the labels of the k nearest rows, nearest first
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel,0) + 1
    # max() returns the first key holding the highest count, i.e. the
    # earliest-inserted label on a tie, which matches a stable descending sort
    return max(classCount,key=classCount.get)

可视化分析

import numpy
from numpy import *
# 将测试数据转换为需要的类型
def file2matrix(filename):
    '''
    Parse a tab-separated data file into a feature matrix and a label list.

    Every non-blank line must hold at least three numeric feature fields
    followed by an integer class label in the last field.

    Return value for datingTestSet2.txt
    returnMat: array (n_lines, 3) with columns [miles, percent, litres]
    -- frequent-flyer miles earned per year
    -- percentage of time spent playing video games
    -- litres of ice cream consumed per week
    classLabelVector: list of int labels, one per row of returnMat
    -- 1, 2, 3 as encoded in the last column of the file

    Raises ValueError if a feature field is not numeric or the last field
    is not an integer.
    '''
    # The with-block guarantees the file is closed even if parsing fails
    with open(filename) as fr:
        stripped = [line.strip() for line in fr]
    # Ignore blank lines (e.g. a trailing newline at the end of the file),
    # which would otherwise fail the numeric conversion below
    arrayOLines = [line for line in stripped if line]
    numberOfLines = len(arrayOLines)
    # One row per data line, three feature columns
    returnMat = zeros((numberOfLines,3))
    classLabelVector = []
    for index,line in enumerate(arrayOLines):
        listFromLine = line.split('\t')
        # First three fields are the features
        returnMat[index,:] = [float(value) for value in listFromLine[:3]]
        # Last field is the class label
        classLabelVector.append(int(listFromLine[-1]))
    return returnMat,classLabelVector

import matplotlib
import matplotlib.pyplot as plt

# Load the feature matrix and the per-row class labels from the data file
datingDataMat,datingLabels = file2matrix('datingTestSet2.txt')

# One figure holding a single subplot (111 = 1x1 grid, first cell)
fig = plt.figure()
ax = fig.add_subplot(111)
# x = column 0, y = column 1 of the feature matrix; the 3rd and 4th arguments
# are marker size and colour, both scaled from the class label
ax.scatter(datingDataMat[:,0],datingDataMat[:,1],15.0*array(datingLabels),15.0*array(datingLabels))
plt.show()

最后的结果如下:

归一化数值

我们可以发现,在数据集中,不同特征的取值范围(极差)差距很大,比如飞行常客里程数的极差,和每周消费冰淇淋公升数的极差相差较大.

所以我们尝试将不同的特征按照相同的区间范围进行计算,避免数值大的特征在距离计算中占主导.

计算公式(和百分制化为150分制的道理一样):
newValue = (OldValue-min)/(max-min)
其中min和max分别代表数据集中该特征的最小值和最大值.
使用这个公式后,数值将统一落在0~1之间(如果需要-1~1的区间,可以再做一次线性变换:2*newValue-1).

代码

def autoNorm(dataSet):
    '''
    Rescale every feature (column) of dataSet linearly into the range 0..1
    using newValue = (oldValue - min) / (max - min).

    dataSet: samples in rows, features in columns - 2-D array

    Returns
    normDataSet: the rescaled values - array with the shape of dataSet
    ranges: max - min of each feature - array
    minVals: minimum of each feature - array

    A feature that is constant over the whole data set (range 0) is mapped
    to 0 instead of producing NaN; its entry in ranges is still 0.
    '''
    # .min(0) / .max(0) reduce along axis 0, giving one value per column
    minVals = dataSet.min(0)
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    # Divide by 1 where the range is 0 so a constant column gives 0/1 = 0
    # rather than 0/0; work on a copy so the returned ranges stay untouched
    divisors = ranges.copy()
    divisors[divisors == 0] = 1
    # Broadcasting applies the per-column vectors to every row
    normDataSet = (dataSet - minVals)/divisors
    return normDataSet,ranges,minVals

对约会网站的测试

最后我们对之前的datingTestSet2.txt进行误差测试
其中hoRatio代表从数据集中留出、用作测试样本的比例(hold-out ratio).
这里用0.1即1000*0.1=100个样本数据进行测试.

def datingClassTest(hoRatio=0.10,filename='datingTestSet2.txt',k=3):
    '''
    Hold-out evaluation of classify0 on a dating data file.

    hoRatio: fraction of the rows, taken from the top of the file, that is
             used as test samples - float (default 0.10)
    filename: tab-separated data file understood by file2matrix - str
    k: number of neighbours passed to classify0 - int (default 3)

    The first int(m*hoRatio) normalised rows are classified against the
    remaining rows. Each prediction and the overall error rate are printed;
    nothing is returned.

    Raises ValueError if hoRatio selects no test rows, since the error rate
    would be undefined.
    '''
    datingDataMat,datingLabels = file2matrix(filename)
    # Normalisation is computed over the whole data set, test rows included
    normMat,ranges,minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m*hoRatio)
    if numTestVecs == 0:
        raise ValueError('hoRatio selects no test samples from %d rows' % m)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are test samples, rows [numTestVecs, m) are training data
        classifierResult = classify0(normMat[i,:],normMat[numTestVecs:m,:],
                                     datingLabels[numTestVecs:m],k)
        print('the classifier came back with: %d,the real answer is: %d'
              % (classifierResult,datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print('the total error rate is: %f' % (errorCount/float(numTestVecs)))

datingClassTest()

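测试结果(注意:下面的输出是在距离按axis=0求和的classify0版本下得到的,此时只有训练集的前3个样本参与投票,0.6的错误率接近三分类随机猜测的水平,并不代表kNN的真实表现;距离应当按axis=1对每个样本的各个特征求和):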

the classifier came back with: 1,the real answer is: 3
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 3,the real answer is: 1
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 3,the real answer is: 1
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 3
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 2,the real answer is: 1
the classifier came back with: 3,the real answer is: 1
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 3,the real answer is: 2
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 3,the real answer is: 1
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 3
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 2,the real answer is: 2
the classifier came back with: 3,the real answer is: 1
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 3,the real answer is: 1
the classifier came back with: 1,the real answer is: 3
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 3,the real answer is: 2
the classifier came back with: 1,the real answer is: 3
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 3,the real answer is: 1
the classifier came back with: 3,the real answer is: 1
the classifier came back with: 3,the real answer is: 1
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 2,the real answer is: 1
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 3,the real answer is: 2
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 3
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 2,the real answer is: 3
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 3,the real answer is: 2
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 3
the classifier came back with: 2,the real answer is: 1
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 3
the classifier came back with: 1,the real answer is: 3
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 3,the real answer is: 3
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 2,the real answer is: 1
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 3
the classifier came back with: 1,the real answer is: 3
the classifier came back with: 2,the real answer is: 1
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 1,the real answer is: 1
the classifier came back with: 1,the real answer is: 3
the classifier came back with: 1,the real answer is: 3
the classifier came back with: 1,the real answer is: 2
the classifier came back with: 3,the real answer is: 1
the classifier came back with: 1,the real answer is: 1
the total error rate is: 0.600000

Done,And thank you for watching!

python 机器学习 科学计算库

Jupyter Notebook

math Last Checkpoint: a few seconds ago (autosaved) [Python 3]

Python 3

Code:

import numpy as np

np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

a=np.arange(10)

#可以直接对数组进行运算

a = a ** 2

a

array([ 0,  1,  4,  9, 16, 25, 36, 49, 64, 81], dtype=int32)

​

#Scipy

#用来做高等数学等计算的包

from scipy import linalg

#生成一个二维数组

A = np.array([[1,2],[3,4]])

A

array([[1, 2],
       [3, 4]])

#计算行列式的值

#1*4-2*3

linalg.det(A)

​

-2.0

#Pandas

#是一种构建于Numpy的高级数据结构和精巧工具,快速简单的处理数据

import pandas as pd

#序列

s = pd.Series([1,3,5,np.nan,6,8])

s

0    1.0
1    3.0
2    5.0
3    NaN
4    6.0
5    8.0
dtype: float64

#日期索引:从起始日期20130101开始,连续6天

dates = pd.date_range('20130101',periods=6)

dates

DatetimeIndex(['2013-01-01', '2013-01-02', '2013-01-03', '2013-01-04',
               '2013-01-05', '2013-01-06'],
              dtype='datetime64[ns]', freq='D')

#生成表格

#index:行标识

#columns:列标识

#rand是0-1的均匀分布,randn是均值为0方差为1的正态分布;

#np.random.rand(n)或randn(n)生成长度为n的一维随机数组。

#np.random.rand(m,n)或randn(m,n)生成m*n的随机数矩阵。

df = pd.DataFrame(np.random.randn(6,4),index=dates,columns=list('ABCD'))

df

    A   B   C   D
2013-01-01  1.210884    0.616424    0.961066    0.173936
2013-01-02  0.358245    0.506724    -0.047834   0.587061
2013-01-03  -0.508396   0.012049    -0.114224   -1.195929
2013-01-04  2.303441    0.536666    -1.013810   -0.574154
2013-01-05  -1.327828   -0.003089   0.662432    0.038886
2013-01-06  1.379826    1.554135    -0.681174   -0.816094

#通过B列升序排序(sort_values默认ascending=True;降序需传入ascending=False)

df.sort_values(by='B')

#从上到下多少行

#df.head()

#从下到上多少行

#df.tail()

#所有值和描述

#df.describe()

#转置

#df.T

​

    A   B   C   D
2013-01-01  1.210884    0.616424    0.961066    0.173936
2013-01-02  0.358245    0.506724    -0.047834   0.587061
2013-01-03  -0.508396   0.012049    -0.114224   -1.195929
2013-01-04  2.303441    0.536666    -1.013810   -0.574154
2013-01-05  -1.327828   -0.003089   0.662432    0.038886

#绘图

import matplotlib.pyplot as plt

plt.plot([1,2,3])

plt.ylabel('some numbers')

plt.show()

​