Sklearn-SVM-Breast Cancer Wisconsin (Diagnostic) Data Set

Sklearn's SVM implementation is based on LIBSVM. This article uses the SVC model to classify the Wisconsin Diagnostic Breast Cancer data set.

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold

# display options
desired_width = 480
pd.set_option('display.width', desired_width)

### data URL
data_url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data'

### column names (se = standard error); see https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.names for details
column_name = ['id', 'diagnosis', 'radius_mean', 'texture_mean', 'perimeter_mean', 'area_mean', 'smoothness_mean', 'compactness_mean', 'concavity_mean', \
               'concave points_mean', 'symmetry_mean', 'fractal_dimension_mean', 'radius_se', 'texture_se', 'perimeter_se', 'area_se', 'smoothness_se', \
               'compactness_se', 'concavity_se', 'concave points_se', 'symmetry_se', 'fractal_dimension_se', 'radius_worst', 'texture_worst', 'perimeter_worst', \
               'area_worst', 'smoothness_worst', 'compactness_worst', 'concavity_worst', 'concave points_worst', 'symmetry_worst', 'fractal_dimension_worst']

df = pd.read_csv(data_url, names=column_name)

print(df.dtypes)
print()
print("Wisconsin Diagnostic Breast Cancer DataSets'Shape Informantion: ")
print(df.shape)
print()
print("Column_Name diagnosis Class Information: ")
print(df.groupby('diagnosis').size())
print()
print(df.head(5))

### drop the ID column and map 'M' (malignant) / 'B' (benign) in the diagnosis column to 1 / 0
dummy_fields = ['id']
df = df.drop(dummy_fields, axis=1)
df['diagnosis'] = df['diagnosis'].map({'M': 1, 'B': 0})

### histograms of the data
df.hist(sharex=False, sharey=False, xlabelsize=1, ylabelsize=1)
#plt.show()

data_df = list(df.columns[1:31])  # the 30 feature columns; diagnosis is column 0 after dropping id

X = df.loc[:, data_df]
y = df['diagnosis']

# Test options and evaluation metric
num_folds = 10
seed = 7
scoring = 'accuracy'

### model and hyperparameter grid
svm_model = SVC()
# c_values = [1, 10, 100, 1000]
# kernel_values = ['linear']
# param_grid = dict(C=c_values, kernel=kernel_values)

param_grid = [{'kernel': ['rbf'],
               'gamma': [1e-4, 1e-3, 0.01, 0.1, 0.2, 0.5],
               'C': [1, 10, 100, 1000]},
              {'kernel': ['linear'], 'C': [1, 10, 100, 1000]}]

kfold = KFold(n_splits=num_folds)  # random_state only applies with shuffle=True; recent scikit-learn raises an error otherwise
grid = GridSearchCV(estimator=svm_model, param_grid=param_grid, scoring=scoring, cv=kfold)
grid_result = grid.fit(X, y)
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
print()
print("Grid scores on training set:")
print()
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

The histograms of the data are shown below:
[Figure: Breast Cancer Wisconsin data set histograms]

Output:

dtypes:
id                           int64
diagnosis                   object
radius_mean                float64
texture_mean               float64
perimeter_mean             float64
area_mean                  float64
smoothness_mean            float64
compactness_mean           float64
concavity_mean             float64
concave points_mean        float64
symmetry_mean              float64
fractal_dimension_mean     float64
radius_se                  float64
texture_se                 float64
perimeter_se               float64
area_se                    float64
smoothness_se              float64
compactness_se             float64
concavity_se               float64
concave points_se          float64
symmetry_se                float64
fractal_dimension_se       float64
radius_worst               float64
texture_worst              float64
perimeter_worst            float64
area_worst                 float64
smoothness_worst           float64
compactness_worst          float64
concavity_worst            float64
concave points_worst       float64
symmetry_worst             float64
fractal_dimension_worst    float64
dtype: object

Wisconsin Diagnostic Breast Cancer DataSet's Shape Information: 
(569, 32)

Class distribution of the diagnosis column: 
diagnosis
B    357
M    212
dtype: int64

         id diagnosis  radius_mean  texture_mean  perimeter_mean  area_mean  smoothness_mean  compactness_mean  concavity_mean  concave points_mean  radius_worst  texture_worst  perimeter_worst  area_worst  smoothness_worst  compactness_worst  concavity_worst  concave points_worst  symmetry_worst  fractal_dimension_worst
0    842302         M        17.99         10.38          122.80     1001.0          0.11840           0.27760          0.3001              0.14710         25.38          17.33           184.60      2019.0            0.1622             0.6656           0.7119                0.2654          0.4601                  0.11890
1    842517         M        20.57         17.77          132.90     1326.0          0.08474           0.07864          0.0869              0.07017         24.99          23.41           158.80      1956.0            0.1238             0.1866           0.2416                0.1860          0.2750                  0.08902
2  84300903         M        19.69         21.25          130.00     1203.0          0.10960           0.15990          0.1974              0.12790         23.57          25.53           152.50      1709.0            0.1444             0.4245           0.4504                0.2430          0.3613                  0.08758
3  84348301         M        11.42         20.38           77.58      386.1          0.14250           0.28390          0.2414              0.10520         14.91          26.50            98.87       567.7            0.2098             0.8663           0.6869                0.2575          0.6638                  0.17300
4  84358402         M        20.29         14.34          135.10     1297.0          0.10030           0.13280          0.1980              0.10430         22.54          16.67           152.20      1575.0            0.1374             0.2050           0.4000                0.1625          0.2364                  0.07678

[5 rows x 32 columns]

Best: 0.959578 using {'C': 100, 'kernel': 'linear'}

Grid scores on training set:

0.933216 (0.044219) with: {'C': 1, 'gamma': 0.0001, 'kernel': 'rbf'}
0.917399 (0.029568) with: {'C': 1, 'gamma': 0.001, 'kernel': 'rbf'}
0.630931 (0.176754) with: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}
0.627417 (0.177054) with: {'C': 1, 'gamma': 0.1, 'kernel': 'rbf'}
0.627417 (0.177054) with: {'C': 1, 'gamma': 0.2, 'kernel': 'rbf'}
0.627417 (0.177054) with: {'C': 1, 'gamma': 0.5, 'kernel': 'rbf'}
0.931459 (0.035519) with: {'C': 10, 'gamma': 0.0001, 'kernel': 'rbf'}
0.906854 (0.027474) with: {'C': 10, 'gamma': 0.001, 'kernel': 'rbf'}
0.632689 (0.172067) with: {'C': 10, 'gamma': 0.01, 'kernel': 'rbf'}
0.627417 (0.177054) with: {'C': 10, 'gamma': 0.1, 'kernel': 'rbf'}
0.627417 (0.177054) with: {'C': 10, 'gamma': 0.2, 'kernel': 'rbf'}
0.627417 (0.177054) with: {'C': 10, 'gamma': 0.5, 'kernel': 'rbf'}
0.942004 (0.026254) with: {'C': 100, 'gamma': 0.0001, 'kernel': 'rbf'}
0.906854 (0.027474) with: {'C': 100, 'gamma': 0.001, 'kernel': 'rbf'}
0.632689 (0.172067) with: {'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}
0.627417 (0.177054) with: {'C': 100, 'gamma': 0.1, 'kernel': 'rbf'}
0.627417 (0.177054) with: {'C': 100, 'gamma': 0.2, 'kernel': 'rbf'}
0.627417 (0.177054) with: {'C': 100, 'gamma': 0.5, 'kernel': 'rbf'}
0.929701 (0.023666) with: {'C': 1000, 'gamma': 0.0001, 'kernel': 'rbf'}
0.906854 (0.027474) with: {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
0.632689 (0.172067) with: {'C': 1000, 'gamma': 0.01, 'kernel': 'rbf'}
0.627417 (0.177054) with: {'C': 1000, 'gamma': 0.1, 'kernel': 'rbf'}
0.627417 (0.177054) with: {'C': 1000, 'gamma': 0.2, 'kernel': 'rbf'}
0.627417 (0.177054) with: {'C': 1000, 'gamma': 0.5, 'kernel': 'rbf'}
0.952548 (0.019270) with: {'C': 1, 'kernel': 'linear'}
0.952548 (0.017597) with: {'C': 10, 'kernel': 'linear'}
0.959578 (0.015782) with: {'C': 100, 'kernel': 'linear'}
0.957821 (0.019530) with: {'C': 1000, 'kernel': 'linear'}

Process finished with exit code 0
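As a follow-up sketch (not part of the original run): SVC is sensitive to feature scale, and the grid above was searched on raw features, which is why the RBF kernel collapses toward the majority class (0.6274 ≈ 357/569) at larger gamma. Wrapping the search in a Pipeline with StandardScaler usually lifts the RBF results considerably; sklearn's load_breast_cancer() ships the same WDBC data, which keeps the sketch self-contained.

from sklearn.datasets import load_breast_cancer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, KFold

# scale inside the CV loop so test folds never leak into fitting the scaler
X, y = load_breast_cancer(return_X_y=True)
pipe = Pipeline([('scale', StandardScaler()), ('svc', SVC())])
param_grid = [{'svc__kernel': ['rbf'],
               'svc__gamma': [1e-4, 1e-3, 0.01, 0.1],
               'svc__C': [1, 10, 100, 1000]},
              {'svc__kernel': ['linear'], 'svc__C': [1, 10, 100, 1000]}]
kfold = KFold(n_splits=10, shuffle=True, random_state=7)
grid = GridSearchCV(pipe, param_grid, scoring='accuracy', cv=kfold)
grid.fit(X, y)
print("Best: %f using %s" % (grid.best_score_, grid.best_params_))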

 

Posted in Data Mining, Machine Learning, Python

Oracle 12cR2-Distributed LOBs

This article tests the basic Distributed LOB functionality of Oracle 12cR2.

The database version is 12.2.0.1, and the tests are run between two PDBs.

----- Under the BLUE service
Connected to:
Oracle Database 12c Enterprise Edition Release 12.2.0.1.0 - 64bit Production


USERNAME     INST_NAME   HOST_NAME    SID  SERIAL# VERSION    STARTED  SPID   OPID  CPID         SADDR            PADDR
----------- ----------- ----------- ----- -------- ---------- -------- ------ ----- ----------- ---------------- ----------------
SH           RED-ywdb    rhel7        86   14250   12.2.0.1.0 20170824 8867   8     5324:11336  000000007DE75430 000000007DC8DE38

Create a table with a LOB column under blue:
drop table test_lob purge;
create table test_lob(id number,description clob);
insert into test_lob values (1,rpad('a',5000,'a'));
commit;



--- Under the RED service
SQL> create public database link red_to_blue connect to sh identified by sh using '(DESCRIPTION =
  2      (ADDRESS_LIST =
  3        (ADDRESS = (PROTOCOL = TCP)(HOST = 192.168.2.55)(PORT = 1521))
  4      )
  5      (CONNECT_DATA =
  6        (SERVICE_NAME = blue)
  7      )
  8    )';

Database link created.

SQL> SELECT DBMS_LOB.GETLENGTH@red_to_blue(description)  FROM test_lob@red_to_blue;

DBMS_LOB.GETLENGTH@RED_TO_BLUE(DESCRIPTION)
-------------------------------------------
                                       4000

SQL> SELECT LENGTH(description) FROM test_lob@red_to_blue;

LENGTH(DESCRIPTION)
-------------------
               4000

(Note: the row was inserted with rpad('a',5000,'a'), but RPAD in a SQL statement returns at most 4000 characters, so the stored CLOB is only 4000 characters long.)

SQL>

SQL> SELECT DBMS_LOB.SUBSTR(description) from test_lob@red_to_blue;
SELECT DBMS_LOB.SUBSTR(description) from test_lob@red_to_blue
                       *
ERROR at line 1:
ORA-22992: cannot use LOB locators selected from remote tables

SQL> SELECT DBMS_LOB.SUBSTR@red_to_blue(description) from test_lob@red_to_blue;

DBMS_LOB.SUBSTR@RED_TO_BLUE(DESCRIPTION)
-------------------------------------------------------------------------------------------------------------------------------
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa
SQL> col DESCRIPTION for a10

SQL> update test_lob@red_to_blue t set t.description = 'abc'; -- update through the link

1 row updated.


SQL> select * from test_lob@red_to_blue;

        ID DESCRIPTIO
---------- ----------
         1 abc


SQL> insert into test_lob@red_to_blue(id,description) values(2,'leo'); -- insert through the link

1 row created.

SQL> select * from test_lob@red_to_blue;

        ID DESCRIPTIO
---------- ----------
         1 abc
         2 leo

SQL> commit;

Commit complete.

Operating on a LOB through a remote locator in PL/SQL


SQL> set serveroutput on
SQL>
SQL> DECLARE
  2    avc_str        varchar2(4000);
  3    remote_locator CLOB;
  4  begin
  5    select description
  6      into remote_locator
  7      from test_lob@red_to_blue
  8     where id = 1;
  9    avc_str := dbms_lob.substr(remote_locator, 2, 3);
 10    dbms_output.put_line('The result is: ' || avc_str);
 11  end;
 12  /
The result is: c

PL/SQL procedure successfully completed.

(DBMS_LOB.SUBSTR takes the amount and then the offset: here it requests 2 characters starting at offset 3 of 'abc', and only 'c' is available at that offset, hence the result.)


Operating on a LOB through a remote locator with DBMS_LOB

SQL> set serveroutput on
SQL>
SQL>
SQL> DECLARE
  2    avc_lob CLOB;
  3    avc_str VARCHAR2(20) := 'sklearn';
  4    len     NUMBER(4);
  5    amt     NUMBER(4);
  6  begin
  7    amt := 7;
  8    select description
  9      into avc_lob
 10      from test_lob@red_to_blue
 11     where id = 2
 12       for update;
 13    dbms_lob.write(avc_lob, amt, 1, avc_str);
 14    amt := 10;
 15    dbms_lob.read(avc_lob, amt, 1, avc_str);
 16    len := dbms_lob.getlength(avc_lob);
 17    dbms_output.put_line(avc_str);
 18    dbms_output.put_line(amt);
 19    dbms_output.put_line('get length output = ' || len);
 20  END;
 21  /
sklearn
7
get length output = 7

PL/SQL procedure successfully completed.

SQL> commit;

Commit complete.


SQL>  select * from test_lob@red_to_blue;

        ID DESCRIPTIO
---------- ----------
         1 abc
         2 sklearn


 

Posted in 12c, LOB, PLSQL

sklearn-Convert categorical data into numerical data

This article uses sklearn's LabelEncoder and OneHotEncoder to convert categorical data into numerical data.
OneHotEncoder performs one-hot encoding: a feature with N possible states is encoded with N bits, exactly one of which is 1 and the rest 0.
LabelEncoder performs label encoding: it maps discrete text or numeric values to consecutive integer codes.

import pandas as pd
from sklearn import preprocessing

#set output format
desired_width = 480
pd.set_option('display.width', desired_width)

#initial dataset
df_train =  pd.DataFrame({'Name': ['Braund', 'Cumings', 'Futrelle', 'Allen','McCarthy','Johnson'],
                           'Sex': ['male','female','female','male','male','female'],
                           'Age':[22, 38, 35, 35,54,27]})

# choose categorical data using dataframe.select_dtypes()
df_train_cat = df_train.select_dtypes(include=[object])
print('data sets:')
print(df_train_cat)

#encode labels with values between 0 and n_classes - 1
le = preprocessing.LabelEncoder()

# fit and transform
df_train_cat_le = df_train_cat.apply(le.fit_transform)
print('LabelEncoder:')
print(df_train_cat_le)

#fit and transform the one-hot encoder on the label-encoded columns
ohe = preprocessing.OneHotEncoder()
ohe.fit(df_train_cat_le)
ohelabels = ohe.transform(df_train_cat_le).toarray()

print('OneHotEncoder:')
print(ohelabels)

print('shape size:')
print(ohelabels.shape)

Output:
data sets:
       Name     Sex
0    Braund    male
1   Cumings  female
2  Futrelle  female
3     Allen    male
4  McCarthy    male
5   Johnson  female
LabelEncoder:
   Name  Sex
0     1    1    
1     2    0
2     3    0
3     0    1
4     5    1
5     4    0
OneHotEncoder:
[[ 0.  1.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  1.  0.  0.  0.  1.  0.]
 [ 0.  0.  0.  1.  0.  0.  1.  0.]
 [ 1.  0.  0.  0.  0.  0.  0.  1.]
 [ 0.  0.  0.  0.  0.  1.  0.  1.]
 [ 0.  0.  0.  0.  1.  0.  1.  0.]]
shape size:
(6, 8)

The Name column is the first feature dimension, with the 6 values 0/1/2/3/4/5, encoded as:
0----1 0 0 0 0 0
1----0 1 0 0 0 0
2----0 0 1 0 0 0
3----0 0 0 1 0 0
4----0 0 0 0 1 0
5----0 0 0 0 0 1

The Sex column is the second feature dimension, with the 2 values 0/1, encoded as:
0----1 0
1----0 1

So the rows
   Name  Sex
    1    1  -- encode as 0 1 0 0 0 0 0 1 (the first row of the OneHotEncoder output above)
    2    0  -- encode as 0 0 1 0 0 0 1 0, and the remaining rows follow the same pattern.
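As a side note, in recent scikit-learn versions (a version assumption, not part of the original post) OneHotEncoder accepts string columns directly, so the LabelEncoder step is no longer required; pandas.get_dummies produces the same 0/1 columns with readable names.

import pandas as pd
from sklearn.preprocessing import OneHotEncoder

df = pd.DataFrame({'Name': ['Braund', 'Cumings', 'Futrelle', 'Allen', 'McCarthy', 'Johnson'],
                   'Sex': ['male', 'female', 'female', 'male', 'male', 'female']})

# categories are sorted alphabetically, matching LabelEncoder's integer codes,
# so this yields the same (6, 8) array as above
ohe = OneHotEncoder(sparse_output=False)  # use sparse=False on versions before 1.2
print(ohe.fit_transform(df))

# labeled equivalent via pandas
print(pd.get_dummies(df).astype(int))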


Posted in Data Mining, Machine Learning, Python

Scikit-learn Feature Importance and Chi-squared

The data comes from the MINING_DATA_BUILD_V view in Oracle's SH schema.
The Attribute Importance article tests Oracle's own Attribute Importance computation on the same view.

SQL> desc MINING_DATA_BUILD_V
Name                    Type         Nullable Default Comments 
----------------------- ------------ -------- ------- -------- 
CUST_ID                 NUMBER                                 
CUST_GENDER             CHAR(1)                                
AGE                     NUMBER       Y                         
CUST_MARITAL_STATUS     VARCHAR2(20) Y                         
COUNTRY_NAME            VARCHAR2(40)                           
CUST_INCOME_LEVEL       VARCHAR2(30) Y                         
EDUCATION               VARCHAR2(21) Y                         
OCCUPATION              VARCHAR2(21) Y                         
HOUSEHOLD_SIZE          VARCHAR2(21) Y                         
YRS_RESIDENCE           NUMBER       Y                         
AFFINITY_CARD           NUMBER(10)   Y                         
BULK_PACK_DISKETTES     NUMBER(10)   Y                         
FLAT_PANEL_MONITOR      NUMBER(10)   Y                         
HOME_THEATER_PACKAGE    NUMBER(10)   Y                         
BOOKKEEPING_APPLICATION NUMBER(10)   Y                         
PRINTER_SUPPLIES        NUMBER(10)   Y                         
Y_BOX_GAMES             NUMBER(10)   Y                         
OS_DOC_SET_KANJI        NUMBER(10)   Y    

The sklearn Python code is as follows:

# -*- coding: utf-8 -*-
"""
@author: Leo Zhang
"""
import pandas as pd
from sklearn import preprocessing
from numpy import set_printoptions
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.ensemble import ExtraTreesClassifier

##load mining_data_build.csv data
from_filename = r'f:\mining_data_build.csv'   # raw strings avoid accidental escape sequences
to_filename = r'f:\mining_data_build_preprocessed.csv'
# column names, listed for reference; pd.read_csv reads them from the csv header row
column_name = ['CUST_GENDER','AGE','CUST_MARITAL_STATUS','COUNTRY_NAME','CUST_INCOME_LEVEL','EDUCATION','OCCUPATION','HOUSEHOLD_SIZE',
               'YRS_RESIDENCE','BULK_PACK_DISKETTES','FLAT_PANEL_MONITOR','HOME_THEATER_PACKAGE','BOOKKEEPING_APPLICATION',
               'PRINTER_SUPPLIES','Y_BOX_GAMES','OS_DOC_SET_KANJI','AFFINITY_CARD']
df_mining_data_build = pd.read_csv(from_filename, engine='python')
#print(df_mining_data_build.columns)

mining_data_build_v = df_mining_data_build

# replace '?' in each categorical column with its most frequent value (the 'top'
# row of describe()); positional indexing like [value][2] is deprecated in
# recent pandas, so use .loc['top'] instead
for value in ['CUST_GENDER', 'CUST_MARITAL_STATUS', 'COUNTRY_NAME',
       'CUST_INCOME_LEVEL', 'EDUCATION', 'OCCUPATION','HOUSEHOLD_SIZE']:
    top = mining_data_build_v.describe(include='all')[value].loc['top']
    mining_data_build_v[value].replace(['?'], [top], inplace=True)

# 'cat' is short for categorical; LabelEncoder assigns consecutive integer codes to discrete text values
le = preprocessing.LabelEncoder()
CUST_GENDER_CAT = le.fit_transform(df_mining_data_build.CUST_GENDER)
CUST_MARITAL_STATUS_CAT = le.fit_transform(df_mining_data_build.CUST_MARITAL_STATUS)
COUNTRY_NAME_CAT   = le.fit_transform(df_mining_data_build.COUNTRY_NAME)
CUST_INCOME_LEVEL_CAT = le.fit_transform(df_mining_data_build.CUST_INCOME_LEVEL)
EDUCATION_CAT = le.fit_transform(df_mining_data_build.EDUCATION)
OCCUPATION_CAT = le.fit_transform(df_mining_data_build.OCCUPATION)
HOUSEHOLD_SIZE_CAT = le.fit_transform(df_mining_data_build.HOUSEHOLD_SIZE)

mining_data_build_v['CUST_GENDER_CAT'] = CUST_GENDER_CAT
mining_data_build_v['CUST_MARITAL_STATUS_CAT'] = CUST_MARITAL_STATUS_CAT
mining_data_build_v['COUNTRY_NAME_CAT'] = COUNTRY_NAME_CAT
mining_data_build_v['CUST_INCOME_LEVEL_CAT'] = CUST_INCOME_LEVEL_CAT
mining_data_build_v['EDUCATION_CAT'] = EDUCATION_CAT
mining_data_build_v['OCCUPATION_CAT'] = OCCUPATION_CAT
mining_data_build_v['HOUSEHOLD_SIZE_CAT'] = HOUSEHOLD_SIZE_CAT

#drop the original categorical columns from the pandas DataFrame
dummy_fields = ['CUST_GENDER', 'CUST_MARITAL_STATUS', 'COUNTRY_NAME',
                  'EDUCATION', 'OCCUPATION','HOUSEHOLD_SIZE']

mining_data_build_v = mining_data_build_v.drop(dummy_fields, axis = 1)
mining_data_build_v = mining_data_build_v.reindex(columns=['CUST_GENDER_CAT','AGE','CUST_MARITAL_STATUS_CAT','COUNTRY_NAME_CAT',
'CUST_INCOME_LEVEL_CAT','EDUCATION_CAT', 'OCCUPATION_CAT','HOUSEHOLD_SIZE_CAT','YRS_RESIDENCE','BULK_PACK_DISKETTES',
'FLAT_PANEL_MONITOR','HOME_THEATER_PACKAGE','BOOKKEEPING_APPLICATION', 'PRINTER_SUPPLIES',
'Y_BOX_GAMES','OS_DOC_SET_KANJI','AFFINITY_CARD'])  # reindex_axis was removed in recent pandas

#print(mining_data_build_v.head(2))

#save the DataFrame to a csv file without the index column
#mining_data_build_v.to_csv(to_filename, sep=',',index = False, encoding='utf-8')


arr_mining_data_build = mining_data_build_v.values

print(mining_data_build_v.columns)

X = arr_mining_data_build[:,0:16]
Y = arr_mining_data_build[:,16]

#set print precision for numpy output
set_printoptions(precision=2,suppress=True)
###Feature Importance
model = ExtraTreesClassifier()
model.fit(X,Y)
print("Feature Importance:")
print(model.feature_importances_)


##Chi-squared test for classification: score all attributes and select the 5 highest-scoring ones
k_best = SelectKBest(score_func=chi2,k = 5)
fit_mining_data_build = k_best.fit(X,Y)
print("Chi-squared Column Names:")
print(fit_mining_data_build.scores_)

The program output is:
Feature Importance:
[ 0.04  0.18  0.06  0.02  0.11  0.15  0.12  0.1   0.11  0.02  0.02  0.03  0.03  0.   0.02  0.  ]
The five largest importances correspond to: AGE, CUST_INCOME_LEVEL, EDUCATION, OCCUPATION, YRS_RESIDENCE

Chi-squared scores:
[  24.91  436.23    1.99    5.31    1.23   73.73    0.83    4.3   158.87    0.18    0.51   51.15    4.72    0.     84.56    1.02]
The five largest scores correspond to: AGE, EDUCATION, YRS_RESIDENCE, HOME_THEATER_PACKAGE, Y_BOX_GAMES
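A small helper sketch (not in the original post) makes the "five largest" reading explicit by pairing the chi2 scores with the column names; X_df stands for the 16 feature columns of mining_data_build_v as a DataFrame, which is an assumption of this sketch.

import pandas as pd
from sklearn.feature_selection import SelectKBest, chi2

def top_k_features(X_df, y, k=5):
    # fit the selector and return the k highest-scoring columns by name
    selector = SelectKBest(score_func=chi2, k=k).fit(X_df, y)
    scores = pd.Series(selector.scores_, index=X_df.columns)
    return scores.sort_values(ascending=False).head(k)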
Posted in algorithm, Machine Learning, Python

Oracle Data Mining- Attribute Importance

The features of the data used to train a model have a large influence on the model's accuracy.
Too much information lowers the effectiveness of the mining process. Some data attributes are meaningless for building and testing a model, and they can degrade the quality and accuracy of the model.
For example, if you have many dimensions of information about a group of people and want to predict the likelihood that members of the group develop a certain disease, an attribute such as how many cars a person's family owns contributes nothing to the prediction.
Irrelevant attributes add noise to the data and reduce model accuracy; noisy data also increases the model's size and the time and system resources consumed during model building and scoring.
Data sets may also contain groups of correlated attributes that actually measure the same underlying feature. Such attribute groups skew the logic of the algorithm and degrade model accuracy.
Wide data sets with many attributes generally demand a lot of processing power from mining algorithms: the higher the dimensionality, the higher the computational cost.
To minimize noise, correlation, and high dimensionality, some form of dimensionality reduction is needed in the preprocessing stage of data mining.
Feature selection and feature extraction are two forms of dimensionality reduction, which Oracle describes as follows:
Feature selection: selecting the most relevant attributes
Feature extraction: combining attributes into a new reduced set of features

Feature Selection and Attribute Importance

Finding the most significant predictors is the goal of some data mining projects; for example, a model might aim to find the main characteristics that generate credit-card risk. Oracle Data Mining supports the Attribute Importance mining function, which ranks attributes according to their importance in predicting a target. Attribute Importance does not perform feature selection: all predictors remain in the model. In true feature selection, attributes that rank below a given threshold are removed from the model.
Feature selection is used as a preprocessing step in predictive modeling to improve efficiency. With Automatic Data Preparation enabled, Oracle implements feature selection as an algorithm-level optimization in Decision Tree and Naive Bayes; Generalized Linear Models can be configured to perform feature selection as a preprocessing step.
Oracle Data Mining does not support scoring with Attribute Importance. The result of Attribute Importance is a ranking of the attributes by their predictive influence on the build data; the ranking and importance measures can be used to select training data for classification models.
Oracle Data Mining implements Attribute Importance with the Minimum Description Length algorithm.

Feature Extraction

Unlike feature selection, which selects and retains the most significant attributes, feature extraction reduces attributes by transforming them: the transformed attributes, or features, are linear combinations of the original attributes. Feature extraction produces a small, rich set of features; the maximum number of features can be specified by the user or determined by the algorithm, the default being algorithm-determined.
Models built on extracted features can be of higher quality, because the data is described by fewer, more meaningful features than the untransformed data.
Feature extraction projects high-dimensional data onto a smaller number of dimensions, which makes the data easier to visualize.
Applications of feature extraction include latent semantic analysis, data compression, data decomposition and projection, and pattern recognition. Feature extraction can also be used to speed up supervised learning and make it more effective.
It can likewise be used to extract the themes of a collection of documents, where each theme is a combination of words.
Oracle's feature extraction algorithms support scoring. As an unsupervised mining function, feature extraction does not require a target; when applied, a feature extraction model transforms its input into a set of features.
The feature extraction algorithms in Oracle Data Mining are listed below (a short scikit-learn illustration follows the list):
Non-Negative Matrix Factorization (NMF)
Singular Value Decomposition (SVD)
Principal Component Analysis (PCA)
Explicit Semantic Analysis (ESA)
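As a quick illustration of feature extraction outside Oracle (a scikit-learn sketch, not one of Oracle's implementations), PCA produces features that are linear combinations of the original attributes, exactly as described above:

from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA

X, _ = load_breast_cancer(return_X_y=True)
pca = PCA(n_components=5)              # keep 5 extracted features
X_reduced = pca.fit_transform(X)       # (569, 30) -> (569, 5)
print(pca.explained_variance_ratio_)   # share of variance each extracted feature explains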

Testing Oracle's Attribute Importance algorithm

The test environment is Oracle 12.2.0.1.
SQL> -- Cleanup old output table for repeat runs
SQL> BEGIN EXECUTE IMMEDIATE 'DROP TABLE ai_explain_output';
  2  EXCEPTION WHEN OTHERS THEN NULL; END;
  3  /
 
PL/SQL procedure successfully completed
SQL> -------------------
SQL> -- Run the EXPLAIN routine to get attribute importance results
SQL> BEGIN
  2    DBMS_PREDICTIVE_ANALYTICS.EXPLAIN(
  3      data_table_name     => 'mining_data_build_v',
  4      explain_column_name => 'affinity_card',
  5      result_table_name   => 'ai_explain_output');
  6  END;
  7  /
 
PL/SQL procedure successfully completed
SQL> ------------------------
SQL> -- DISPLAY RESULTS
SQL> --
SQL> -- List of attribute names ranked by their importance value.
SQL> -- The larger the value, the more impact that attribute has
SQL> -- on causing variation in the target column.
SQL> --
SQL> column attribute_name    format a40
SQL> column explanatory_value format 9.999
SQL> SELECT attribute_name, explanatory_value, rank
  2  FROM ai_explain_output
  3  ORDER BY rank, attribute_name;
 
ATTRIBUTE_NAME                           EXPLANATORY_VALUE       RANK
---------------------------------------- ----------------- ----------
HOUSEHOLD_SIZE                           0.194540348609091          1
CUST_MARITAL_STATUS                      0.193586215628217          2
YRS_RESIDENCE                            0.115114555100227          3
EDUCATION                                0.105578426591968          4
AGE                                      0.103917188776063          5
OCCUPATION                               0.092052058645775          6
Y_BOX_GAMES                              0.077157404278814          7
HOME_THEATER_PACKAGE                     0.069102344332980          8
CUST_GENDER                              0.043162087348240          9
BOOKKEEPING_APPLICATION                  0.023505549979382         10
BULK_PACK_DISKETTES                                      0         11
COUNTRY_NAME                                             0         11
CUST_ID                                                  0         11
CUST_INCOME_LEVEL                                        0         11
FLAT_PANEL_MONITOR                                       0         11
OS_DOC_SET_KANJI                                         0         11
PRINTER_SUPPLIES                                         0         11
 
17 rows selected

 

Posted in 12c, algorithm, Data Mining

Oracle 12cR2 -Part3-Json Partition Table

A table containing JSON data can be partitioned on a JSON virtual column, or in a conventional way such as by date. When partitioning on a virtual column, the virtual column is extracted from the JSON column with the JSON_VALUE function. Every INSERT of a JSON document into the partitioned table executes JSON_VALUE once, so the conversion cost can be significant when inserting large volumes of JSON data.

Rules for partitioning a table on a JSON virtual column:
The virtual column used as the partitioning key must be defined with a JSON_VALUE expression;
The data type of the virtual column is determined by the RETURNING clause of JSON_VALUE;
The path JSON_VALUE uses to extract the data must not contain any predicates;
The JSON column referenced by the virtual column may declare an is json check constraint, but it is not required.

A table partitioned by a conventional date key rather than the JSON column:

SQL> create table t_partition
  2  (id number,
  3  create_Date date,
  4  name clob
  5  constraint data_echo_area_json check (name is json))
  6  lob(name) store as securefile
  7  partition by range(create_date)
  8  (partition p1 values less than (TO_DATE('2016-08', 'YYYY-MM')) TABLESPACE users,
  9  partition p2 values less than (TO_DATE('2016-09', 'YYYY-MM')) TABLESPACE users,
 10  partition pmax values less than(maxvalue) tablespace users);

Table created.

A table using a JSON virtual column as the partitioning key:

SQL> create table t_json_partition(
  2  id varchar2(32) NOT NULL PRIMARY KEY,
  3  data_echo_area CLOB,
  4  part_employee_id NUMBER GENERATED ALWAYS AS
  5       (json_value (data_echo_area, '$.employee_id' RETURNING NUMBER)))
  6  LOB (data_echo_area) STORE AS securefile
  7    PARTITION BY RANGE (part_employee_id)
  8     (PARTITION p1 VALUES LESS THAN (100),
  9      PARTITION p2 VALUES LESS THAN (200));

Table created.
SQL> insert into t_json_partition(id,data_echo_area) values
  2  (sys_guid(),
  3  '{
  4      "employee_id": 100,
  5      "first_name": "Steven",
  6      "last_name": "King",
  7      "email": "SKING",
  8      "job_id": "AD_PRES",
  9      "salary": 24000.0,
 10      "manager_id": null,
 11      "department_id": 90,
 12      "address":{"city": "Oxford",
 13                "STATE_PROVINCE": "Oxford",
 14                "STREET_ADDRESS": "Magdalen Centre, The Oxford Science Park"
 15                }
 16     }'
 17  );

1 row created.

SQL>
SQL> insert into t_json_partition(id,data_echo_area) values
  2  (sys_guid(),
  3  '{
  4      "employee_id": 101,
  5      "first_name": "Neena",
  6      "last_name": "Kochhar",
  7      "email": "NKOCHHAR",
  8      "job_id": "AD_VP",
  9      "salary": 17000.0,
 10      "manager_id": 100,
 11      "department_id": 90,
 12      "address":{"city": "South Brunswick",
 13                 "STATE_PROVINCE": "New Jersey",
 14                 "STREET_ADDRESS": "2007 Zagora St"
 15                 }
 16     }'
 17  );

1 row created.

SQL> commit;

Commit complete.
Posted in 12c, json

Scikit-learn GaussianNB and Oracle Naive Bayes

The commonly used scikit-learn naive Bayes models are the Gaussian, multinomial, and Bernoulli models.
This article uses sklearn's Gaussian model to compute model accuracy and compares the result with Oracle Naive Bayes on the same data set. It is intended for study and reference only; production use needs further investigation.

The original data set is MINING_DATA_BUILD_V from the SH demo schema of Oracle 11.2.0.4, exported to a csv file with SQL Developer, keeping the header row with the column names. Some columns in the data set have missing values, which are filled with '?'.

The Python code, which trains with pandas + sklearn, is as follows:

# -*- coding: utf-8 -*-
"""
Created on Tue Aug 29 12:14:19 2017

@author: Leo Zhang
"""

import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split  # sklearn.cross_validation was removed in favor of model_selection
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

#mining_data_build.csv includes a header row with the column names
mining_data_build = pd.read_csv('c:\\mining_data_build.csv', engine='python')
print("mining_data_build.isnull().sum()")
print(mining_data_build.isnull().sum())

mining_data_build_v = mining_data_build

#print(mining_data_build_v.describe(include= 'all'))

#replace the '?' placeholders with the most frequent value (the 'top' row of
#pandas describe()); other fill strategies could be used instead:
# replace(['?','$'], method='pad')    # forward fill
# replace(['?','$'], method='ffill')  # forward fill
# replace(['?','$'], method='bfill')  # backward fill

for value in ['CUST_GENDER', 'CUST_MARITAL_STATUS', 'COUNTRY_NAME',
       'CUST_INCOME_LEVEL', 'EDUCATION', 'OCCUPATION','HOUSEHOLD_SIZE']:
    top = mining_data_build_v.describe(include='all')[value].loc['top']
    mining_data_build_v[value].replace(['?'], [top], inplace=True)

# 'cat' is short for categorical. Only the columns that Oracle treats as CATEGORICAL are label-encoded, so the attributes used for training match the types in the Oracle model signature shown below; the untouched columns are NUMERICAL.
le = preprocessing.LabelEncoder()
CUST_GENDER_CAT = le.fit_transform(mining_data_build.CUST_GENDER)
CUST_MARITAL_STATUS_CAT = le.fit_transform(mining_data_build.CUST_MARITAL_STATUS)
COUNTRY_NAME_CAT   = le.fit_transform(mining_data_build.COUNTRY_NAME)
CUST_INCOME_LEVEL_CAT = le.fit_transform(mining_data_build.CUST_INCOME_LEVEL)
EDUCATION_CAT = le.fit_transform(mining_data_build.EDUCATION)
OCCUPATION_CAT = le.fit_transform(mining_data_build.OCCUPATION)
HOUSEHOLD_SIZE_CAT = le.fit_transform(mining_data_build.HOUSEHOLD_SIZE)

#print(CUST_GENDER_CAT)

mining_data_build_v['CUST_GENDER_CAT'] = CUST_GENDER_CAT
mining_data_build_v['CUST_MARITAL_STATUS_CAT'] = CUST_MARITAL_STATUS_CAT
#mining_data_build_v['COUNTRY_NAME_CAT'] = COUNTRY_NAME_CAT
#mining_data_build_v['CUST_INCOME_LEVEL_CAT'] = CUST_INCOME_LEVEL_CAT
mining_data_build_v['EDUCATION_CAT'] = EDUCATION_CAT
mining_data_build_v['OCCUPATION_CAT'] = OCCUPATION_CAT
mining_data_build_v['HOUSEHOLD_SIZE_CAT'] = HOUSEHOLD_SIZE_CAT

#drop the original categorical columns from the pandas DataFrame
dummy_fields = ['CUST_GENDER', 'CUST_MARITAL_STATUS', 'COUNTRY_NAME', 
                  'EDUCATION', 'OCCUPATION','HOUSEHOLD_SIZE']

mining_data_build_v = mining_data_build_v.drop(dummy_fields, axis = 1)


mining_data_build_v = mining_data_build_v.reindex(columns=['CUST_GENDER_CAT','AGE','CUST_MARITAL_STATUS_CAT','EDUCATION_CAT','HOUSEHOLD_SIZE_CAT',
                'OCCUPATION_CAT','YRS_RESIDENCE','HOME_THEATER_PACKAGE','BOOKKEEPING_APPLICATION',
                 'Y_BOX_GAMES','AFFINITY_CARD'])  # reindex_axis was removed in recent pandas

print(mining_data_build_v.head(5))


num_features = ['CUST_GENDER_CAT','AGE','CUST_MARITAL_STATUS_CAT','EDUCATION_CAT','HOUSEHOLD_SIZE_CAT',
                'OCCUPATION_CAT','YRS_RESIDENCE','HOME_THEATER_PACKAGE','BOOKKEEPING_APPLICATION',
                 'Y_BOX_GAMES']

#z-score standardization
scaled_features = {}
for each in num_features:
    mean, std = mining_data_build_v[each].mean(), mining_data_build_v[each].std()
    scaled_features[each] = [mean, std]
    mining_data_build_v.loc[:, each] = (mining_data_build_v[each] - mean)/std
print(mining_data_build_v.head(5))  # a bare head(5) has no effect in a script

features = mining_data_build_v.values[:,:10]
target = mining_data_build_v.values[:,10]

print(target)
features_train, features_test, target_train, target_test = train_test_split(features,target, test_size = 0.33, random_state = 7)

clf = GaussianNB()
clf.fit(features_train, target_train)
target_pred = clf.predict(features_test)

print("GaussianNB's accuracy is: ",accuracy_score(target_test, target_pred, normalize = True))

After z-score standardization, mining_data_build_v.head(5) looks like this:

CUST_GENDER_CAT       AGE  CUST_MARITAL_STATUS_CAT  EDUCATION_CAT  HOUSEHOLD_SIZE_CAT  OCCUPATION_CAT  YRS_RESIDENCE  HOME_THEATER_PACKAGE   BOOKKEEPING_APPLICATION  Y_BOX_GAMES  AFFINITY_CARD
0        -1.443963  0.154586           0.679725       1.009408              -0.648307        0.790136      -0.046158              0.858854                  0.367985    -0.633720              0
1         0.692077 -0.872079           0.679725       0.359992              -0.648307        1.311677      -0.566742             -1.163566                  0.367985     1.576932              0
2        -1.443963 -1.385411           0.679725       0.684700              -0.648307       -1.296031      -1.087327             -1.163566                  0.367985     1.576932              0
3         0.692077  0.447919          -0.049072       0.359992               0.066212       -0.774489       0.474426              0.858854                  0.367985    -0.633720              1
4         0.692077 -0.358746           0.679725       1.009408               2.209770        1.311677       0.474426             -1.163566                  0.367985    -0.633720              1
Prediction accuracy:
GaussianNB's accuracy is:  0.759595959596
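A single train/test split gives a fairly noisy estimate. As a minimal sketch (not part of the original comparison), 10-fold cross-validation on the same features and target arrays gives a more stable accuracy to set against Oracle's 0.7733 below:

from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import GaussianNB

# 'features' and 'target' are the arrays built earlier in this script
scores = cross_val_score(GaussianNB(), features, target, cv=10, scoring='accuracy')
print("mean accuracy: %.4f (+/- %.4f)" % (scores.mean(), scores.std()))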

Output of Oracle Naive Bayes

SQL> BEGIN
  2    INSERT INTO nb_sh_sample_settings (setting_name, setting_value) VALUES
  3      (dbms_data_mining.prep_auto,dbms_data_mining.prep_auto_on);    -- enable Automatic Data Preparation
  4    INSERT INTO nb_sh_sample_settings VALUES
  5      (dbms_data_mining.clas_priors_table_name, 'nb_sh_sample_priors');
  6  END;
  7  /

PL/SQL procedure successfully completed.
SQL> -- DISPLAY MODEL SIGNATURE
SQL> --
SQL> column attribute_name format a40
SQL> column attribute_type format a20
SQL> SELECT attribute_name, attribute_type
  2    FROM user_mining_model_attributes
  3   WHERE model_name = 'NB_SH_CLAS_SAMPLE'
  4  ORDER BY attribute_name;

ATTRIBUTE_NAME                           ATTRIBUTE_TYPE
---------------------------------------- --------------------
AFFINITY_CARD                            CATEGORICAL
AGE                                      NUMERICAL
BOOKKEEPING_APPLICATION                  NUMERICAL
CUST_GENDER                              CATEGORICAL
CUST_MARITAL_STATUS                      CATEGORICAL
EDUCATION                                CATEGORICAL
HOME_THEATER_PACKAGE                     NUMERICAL
HOUSEHOLD_SIZE                           CATEGORICAL
OCCUPATION                               CATEGORICAL
YRS_RESIDENCE                            NUMERICAL
Y_BOX_GAMES                              NUMERICAL


SQL> -- Compute Test Metrics
SQL> DECLARE
  2    v_accuracy         NUMBER;
  3    v_area_under_curve NUMBER;
  4  BEGIN
  5     DBMS_DATA_MINING.COMPUTE_CONFUSION_MATRIX (
  6       accuracy                    => v_accuracy,
  7       apply_result_table_name     => 'nb_sh_sample_test_apply',
  8       target_table_name           => 'nb_sh_sample_test_targets',
  9       case_id_column_name         => 'cust_id',
 10       target_column_name          => 'affinity_card',
 11       confusion_matrix_table_name => 'nb_sh_sample_confusion_matrix',
 12       score_column_name           => 'PREDICTION',   -- default
 13       score_criterion_column_name => 'PROBABILITY',  -- default
 14       cost_matrix_table_name      => 'nb_sh_cost');
 15     DBMS_OUTPUT.PUT_LINE('**** MODEL ACCURACY ****: ' || ROUND(v_accuracy,4));
 16  
 17     DBMS_DATA_MINING.COMPUTE_LIFT (
 18       apply_result_table_name => 'nb_sh_sample_test_apply',
 19       target_table_name       => 'nb_sh_sample_test_targets',
 20       case_id_column_name     => 'cust_id',
 21       target_column_name      => 'affinity_card',
 22       lift_table_name         => 'nb_sh_sample_lift',
 23       positive_target_value   => '1',
 24       num_quantiles           => '10',
 25       cost_matrix_table_name  => 'nb_sh_cost');
 26  
 27     DBMS_DATA_MINING.COMPUTE_ROC (
 28       roc_area_under_curve        => v_area_under_curve,
 29       apply_result_table_name     => 'nb_sh_sample_test_apply',
 30       target_table_name           => 'nb_sh_sample_test_targets',
 31       case_id_column_name         => 'cust_id',
 32       target_column_name          => 'affinity_card',
 33       roc_table_name              => 'nb_sh_sample_roc',
 34       positive_target_value       => '1',
 35       score_column_name           => 'PREDICTION',
 36       score_criterion_column_name => 'PROBABILITY');
 37     DBMS_OUTPUT.PUT_LINE('**** AREA UNDER ROC CURVE ****: ' ||
 38       ROUND(v_area_under_curve,4));
 39  END;
 40  /
**** MODEL ACCURACY ****: .7733
Posted in Data Mining, Machine Learning