使用Python第三方库numpy和pandas,通过分箱法对噪声数据进行平滑处理。
import pandas as pd
import numpy as np
# Sample income values, already sorted ascending, used to demonstrate binning.
income = np.array([
    800, 1000, 1200, 1500, 1500, 1800, 2000, 2300,
    2500, 2800, 3000, 3500, 4000, 4500, 4800, 5000,
])
# Equal-width binning: split the value range of `income` into 4 intervals.
colBin = pd.cut(income, bins=4)
# The top-level `pd.value_counts` is deprecated; wrapping the result in a
# Series and calling `.value_counts` is the supported equivalent.
# `sort=False` keeps the bins in interval order instead of ordering by count.
print(pd.Series(colBin).value_counts(sort=False))
print(colBin)
def binning(col, cut_points, labels=None):
    """Bin ``col`` using its own minimum/maximum plus interior cut points.

    The bin edges are ``[col.min(), *cut_points, col.max()]``. Intervals are
    closed on the right, and the first interval also includes the minimum.

    Args:
        col: Numeric array-like exposing ``min()`` and ``max()`` (for
            example a NumPy array or a pandas Series).
        cut_points: Iterable of interior bin edges (list, tuple, NumPy
            array, ...). Together with the minimum and maximum of ``col``
            they are passed to ``pd.cut`` as the bin edges.
        labels: Optional sequence holding one label per bin, i.e.
            ``len(cut_points) + 1`` entries. When omitted (``None``,
            ``False`` or empty) the bins are labelled ``0, 1, ..., n - 1``.

    Returns:
        Whatever ``pd.cut`` returns for ``col`` with these edges and labels.

    Raises:
        ValueError: Propagated from ``pd.cut``, e.g. when the resulting
            edges are rejected or the number of labels does not match the
            number of bins.
    """
    # Materialize once so any iterable (tuple, ndarray, generator) is accepted
    # and can be both measured and unpacked below.
    cut_points = list(cut_points)

    minval = col.min()
    maxval = col.max()
    # Build the edges by unpacking instead of `list + cut_points`: with a
    # NumPy array `+` would add element-wise, and with a tuple it would fail.
    break_points = [minval, *cut_points, maxval]

    # Explicit checks instead of `not labels`, which raises for array-like
    # labels whose truth value is ambiguous.
    if labels is None or labels is False or len(labels) == 0:
        # Default labels 0, 1, 2, ..., (n - 1), one per bin.
        labels = range(len(cut_points) + 1)

    return pd.cut(
        col,
        bins=break_points,
        labels=labels,
        right=True,
        include_lowest=True,
    )
# Custom binning: three interior edges give four bins, one label per bin.
cut_points = [1000, 2500, 4500]
labels = ["low", "medium", "high", "very high"]
income_box = binning(income, cut_points, labels)
# The top-level `pd.value_counts` is deprecated; wrapping the result in a
# Series and calling `.value_counts` is the supported equivalent.
# `sort=False` keeps the bins in label order instead of ordering by count.
print(pd.Series(income_box).value_counts(sort=False))
print(income_box)