「統計解析のはなし―データに語らせるテクニック」８章

大村平「統計解析のはなし―データに語らせるテクニック」日科技連　の読後行間補充メモ

同書籍は、印刷数表と手計算による統計解析を紹介した良書。

本稿では、同書籍の設例について、プログラムによる演算処理をした場合の例を示す。

コードと出力例は、Python［Google Colaboratory］

８章　複雑さをばらす法　～多変量解析のはなし～

p231　主要な因子を選び出す　表8.2

検定：因子が評価を左右する影響力を有するか。

因子分析：評価に影響を及ぼす主要な因子を選びだす。

import pandas as pd

# 基礎データ
A = [3,1,2,3,1]
B = [2,3,1,2,2]
C = [3,2,2,3,1]
D = [2,3,1,2,3]

DF = pd.DataFrame([A,B,C,D]).T
index_info = ['甲','乙','丙','丁','戊']
col_info = ['A子','B子','C子','D子']
DF.index =index_info
DF.columns = col_info
DF # 表8.2

	A子	B子	C子	D子
甲	3	2	3	2
乙	1	3	2	3
丙	2	1	2	1
丁	3	2	3	2
戊	1	2	1	3

p233　表8.4　相関係数の一覧表

# コードは上記の続き
import math

# 関数（２つのリストの相関係数）
def correlate(list1, list2):
  s_1 = pd.Series(list1)
  s_2 = pd.Series(list2)
  res = round(s_1.corr(s_2),2)
  return res

# 関数（DataFrameの n行と m行の相関係数）
def corre(n,m):
  list1 = DF.iloc[:,n]
  list2 = DF.iloc[:,m]
  return correlate(list1,list2) # 相関係数の関数

# 全行と全行の相関係数の一覧
line_total = []
for n in range(4): #各行
  n_line = []
  for m in range(4): #各行
    if n == m: # 同一人どおしの比較はスキップ
      n_line.append(float('nan'))
    else:
      n_line.append(corre(m,n)) #ある行と全行の相関係数
  line_total.append(n_line) #全行と全行

# DFに整理
DF84 = pd.DataFrame(line_total,index= col_info)
DF84.columns= col_info

# 関数　セル情報に該当する行・列の情報
def row_col(dataframe,value): # データフレーム、セル情報
  for row in range(dataframe.shape[0]): # 行数
    for column in range(dataframe.shape[1]): # 列数
      if dataframe.iloc[row][column] == value: # セル情報と一致する場合
        return [dataframe.index[row], dataframe.columns[column]] # 該当セルの行と列の番号を取得
      else:
        break

# 相関係数の最大・最小値の情報
maxis = DF84.max().max() # 最大値
print(f'相関係数の最大値：{maxis}は、{row_col(DF84, maxis)[1]}と{row_col(DF84, maxis)[0]}')
minis = DF84.min().min() # 最小値
print(f'相関係数の最小値：{minis}は、{row_col(DF84, minis)[1]}と{row_col(DF84, minis)[0]}')

DF84 # 表8.4

相関係数の最大値：0.9は、A子とC子
相関係数の最小値：-0.6は、A子とD子

p234　表8.5　主要な因子を選び出す

正の相関あり　→特徴が共通の因子に着目

負の相関あり　→特徴が反する因子に着目

import pandas as pd

# 基礎データ
Height = ['tall','short','short','tall']
Shape = ['fat','lean','lean','fat']
Chest = ['big','big','big','flat']
Waist = ['wide','narrow','narrow','wide']
Color = ['white','black','black','white']
Face = ['horizon','vertical','horizon','vertical']
Hair = ['wave','straight','straight','wave']
Voice = ['husky','chimes','husky','chimes']

DFB = pd.DataFrame([Height,Shape,Chest,Waist,Color,Face,Hair,Voice])
index_info = ['Height','Shape','Chest','Waist','Color','Face','Hair','Voice']
col_info = ['A','B','C','D']
DFB.index =index_info
DFB.columns = col_info

# 因子分析
DFB = DFB.assign(AC="", BD="", AD="")
for item in range(len(DFB)):
  if DFB['A'][item] == DFB['C'][item]:
    DFB['AC'][item] = '◎'
  if DFB['B'][item] == DFB['D'][item]:
    DFB['BD'][item] = '〇'
  if not DFB['A'][item] == DFB['D'][item]:
    DFB['AD'][item] = '△'
DFB # 表8.5

p235　因子の候補が無い場合→ベクトルの使用

import pandas as pd
import math

# 基礎データ
TT = [1,-1,1,-1,1,-1,-1,1,1,-1]
SW = [1,-1,-1,-1,1,-1,1,1,1,-1]

# DF化
DF = pd.DataFrame([TT,SW]).T
index_info = ['甲','乙','丙','丁','戊','己','庚','辛','壬','癸']
col_info = ['卓球','相撲']
DF.index =index_info
DF.columns = col_info
print(f'{DF}\n')

# ベクトル角
res = DF['卓球'].corr(DF['相撲']) # 相関係数
pidegrees = math.acos(res) # ベクトルの内角
degrees = math.degrees(pidegrees) # ラジアン角度　→度数
print(f'相関係数：{round(res,1)}\nベクトルの内角：{round(degrees,1)}度')

   卓球  相撲
甲   1   1
乙  -1  -1
丙   1  -1
丁  -1  -1
戊   1   1
己  -1  -1
庚  -1   1
辛   1   1
壬   1   1
癸  -1  -1

相関係数：0.6
ベクトルの内角：53.1度

p243～244　因子の候補が無い場合　図8.5（因子ベクトル）

import numpy as np
import matplotlib.pyplot as plt
from matplotlib import patches

remainder_angle = (90 - degrees)/2
print(f'余角：{round(remainder_angle,2)}度')

# ベクトルPの角度
P_degrees = remainder_angle + degrees
print(f'ベクトルＰの角度：{round(P_degrees,2)}度')
P_pi = math.radians(P_degrees) # 度数　→ラジアン角度

# ベクトルＳの角度
S_degrees = remainder_angle
print(f'ベクトルＳの角度：{round(S_degrees,2)}度')
S_pi = math.radians(S_degrees)

# ベクトル
origin = np.array([0, 0]) # 原点
vecP = np.array([math.cos(P_pi), math.sin(P_pi)]) # P終点
vecS = np.array([math.cos(S_pi), math.sin(S_pi)]) # S終点
vector = [(origin, vecP, 'blue'), # ベクトルP
          (origin, vecS, 'green')] # ベクトルS

# グラフの大きさ
fig ,ax= plt.subplots(figsize=(5,5))

# ベクトル描画
for tail, head, color in vector:
    ax.quiver(*tail,*head, color=color, units='xy', scale=1)
ax.text(vecP[0]*1.05,vecP[1]*1.05, "$\overrightarrow{P}$", color = "blue", size = 13) # Pの座標表示
ax.text(vecS[0]*1.05,vecS[1]*1.05, "$\overrightarrow{S}$", color = "green", size = 13) # Sの座標表示

# 円弧
patchP = patches.Arc(xy=(0, 0), width=0.9, height=0.9, color="red", theta1 =S_degrees, theta2 =P_degrees)
patchS = patches.Arc(xy=(0, 0), width=1, height=1, color="red", theta1 =0, theta2 =S_degrees)
patchY = patches.Arc(xy=(0, 0), width=1, height=1, color="red", theta1 =P_degrees, theta2 = 90)
ax.add_patch(patchP)
ax.add_patch(patchS)
ax.add_patch(patchY)

# 円弧の角度表示
loc_PSx = math.cos((P_pi+S_pi)/2)/2 # PSベクトルの中間位置
loc_PSy = math.sin((P_pi+S_pi)/2)/2
loc_Sx = math.cos(S_pi/2)/2 # Sベクトルとｘ軸の中間位置
loc_Sy = math.sin(S_pi/2)/2
loc_Px = math.sin(S_pi/2)/2/2 # Pベクトルとｙ軸の中間位置（位置補正あり）
loc_Py = math.cos(S_pi/2)/2*1.1
ax.text(loc_PSx, loc_PSy, f'{round(degrees,1)}'"$^{\circ}$", color = "red", size = 11)
ax.text(loc_Sx, loc_Sy, f'{round(remainder_angle,1)}'"$^{\circ}$", color = "red", size = 11)
ax.text(loc_Px, loc_Py, f'{round(remainder_angle,1)}'"$^{\circ}$", color = "red", size = 11)

# 表題、軸ラベル
ax.set_title('Factor Vector', fontsize =11)
ax.set_xlabel('$\overrightarrow{x}$', fontsize =12) # x軸ラベル
ax.set_ylabel('$\overrightarrow{y}$', fontsize =12) # y軸ラベル
ax.set_xlim(0, 1.2) # x軸の範囲
ax.set_ylim(0, 1.2) # y軸の範囲
plt.show()

余角：18.43度
ベクトルＰの角度：71.57度
ベクトルＳの角度：18.43度

p247　主な成分を見つけ出す　表8.9

import pandas as pd
import math

# 設例のDF化
A = [5,4,3,2,1]
B = [5,2,4,1,3]
index_info = ['A子','B子']
columns_info = ['馬','犬','豚','バッタ','ウニ']
DF = pd.DataFrame([A,B],index=index_info, columns=columns_info )

# 関数（２つのリストの相関係数）
def correlate(list1, list2):
  s_1 = pd.Series(list1)
  s_2 = pd.Series(list2)
  res = round(s_1.corr(s_2),2)
  return res

# 相関係数
corre = correlate(A,B)
print(f'相関係数：{corre}')

# ベクトル角度
acos = math.acos(corre) #逆余弦(ラジアン)
d_degree = math.degrees(acos) #ラジアン→度数表記
print(f'ベクトル角度：{round(d_degree,2)}度')

DF

相関係数：0.5
ベクトル角度：60.0度

	馬	犬	豚	バッタ	ウニ
A子	5	4	3	2		1
B子	5	2	4	1		3

p251　主成分に分解　図8.7

from matplotlib import pyplot as plt
import japanize_matplotlib

fig = plt.figure(figsize=(5, 5)) # 図版の大きさ
ax = fig.add_subplot(111) # グラフ１つ
ax.set_aspect("equal") # 直交座標

# 各点をプロット
plt.scatter(A,B,20, color ="red", marker="D") # 各リスト、点の大きさ、色、種類

# 描画範囲
xlower, xupper = -2, 6 
ylower, yupper = -2, 6
plt.xlim(xlower, xupper)
plt.ylim(ylower, yupper)

# 点名称をプロット
for col in range(len(columns_info)):
  ax.text(DF.iloc[0][col],DF.iloc[1][col],DF.columns[col])

# 座標軸
plt.hlines(0, -3, 6, color ='black') # x軸
plt.vlines(0, -3, 6, color ='black') # y軸
plt.plot([xlower, xupper], [ylower, yupper], color="lightblue", linewidth=5, label="第１主成分軸")
plt.plot([xlower, 2], [2, ylower], color="lightgreen", linewidth=5, label="第２主成分軸")

# 関数　垂線の足の座標（一般）
def Perpendicular_Legs(a,b,c,x,y): # 点(x,y) から ax+by+c=0 へ垂線 
  return [(x*b**2-a*b*y-a*c)/(a**2+b**2),(y*a**2-a*b*x-b*c)/(a**2+b**2)]

# 関数　垂線の足の座標（第２主成分軸用）
def PLEG(x,y): # 点(x,y) から ax+by+c=0 へ垂線
  a, b, c = 1,1,0
  return Perpendicular_Legs(a,b,c,x,y)

# 各点から第２主成分軸への垂線  
for i in range(5):
  x_value = DF.iloc[0][i]
  y_value = DF.iloc[1][i]
  plt.plot([PLEG(x_value,y_value)[0],DF.iloc[0][i]],
   [PLEG(x_value,y_value)[1],DF.iloc[1][i]],
           color='red', linestyle="--") # 各点と垂線足とを破線連結

#グラフの軸名称
plt.xlabel('Score_by_A')
plt.ylabel('Score_by_B')
plt.legend()
plt.grid(True)

このブログを検索

The world inside a pillow

「統計解析のはなし―データに語らせるテクニック」８章

８章　複雑さをばらす法　～多変量解析のはなし～

p231　主要な因子を選び出す　表8.2

p233　表8.4　相関係数の一覧表

p234　表8.5　主要な因子を選び出す

p235　因子の候補が無い場合→ベクトルの使用

p247　主な成分を見つけ出す　表8.9

「統計解析のはなし―データに語らせるテクニック」８章

８章 複雑さをばらす法 ～多変量解析のはなし～

p231 主要な因子を選び出す 表8.2

p233 表8.4 相関係数の一覧表

p234 表8.5 主要な因子を選び出す

p235 因子の候補が無い場合→ベクトルの使用

p247 主な成分を見つけ出す 表8.9

８章　複雑さをばらす法　～多変量解析のはなし～

p231　主要な因子を選び出す　表8.2

p233　表8.4　相関係数の一覧表

p234　表8.5　主要な因子を選び出す

p235　因子の候補が無い場合→ベクトルの使用

p247　主な成分を見つけ出す　表8.9