Basic Stat Using R 描述性统计函数 myvars <- mtcars[c("mpg","hp","wt","am")] summary(myvars) # 对一个数据集进行详细统计 ## mpg hp wt am ## Min. :10.40 Min. : 52.0 Min. :1.513 Min. :0.0000 ## 1st Qu.:15.43 1st Qu.: 96.5 1st Qu.:2.581 1st Qu.:0.0000 ## Median :19.20 Median :123.0 Median :3.325 Median :0.0000 ## Mean :20.09 Mean :146.7 Mean :3.217 Mean :0.4062 ## 3rd Qu.:22.80 3rd Qu.:180.0 3rd Qu.:3.610 3rd Qu.:1.0000 ## Max. :33.90 Max. :335.0 Max. :5.424 Max. :1.0000 fivenum(myvars$hp) ## [1] 52 96 123 180 335 # 五个分位数 install.packages("Hmisc") ## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1' ## (as 'lib' is unspecified) library(Hmisc) ## Loading required package: lattice ## Loading required package: survival ## Loading required package: Formula ## Loading required package: ggplot2 ## ## Attaching package: 'Hmisc' ## The following objects are masked from 'package:base': ## ## format.pval, units 1
Basic Stat Using R 描述性统计函数 myvars <- mtcars[c("mpg","hp","wt","am")] summary(myvars) # 对一个数据集进行详细统计 ## mpg hp wt am ## Min. :10.40 Min. : 52.0 Min. :1.513 Min. :0.0000 ## 1st Qu.:15.43 1st Qu.: 96.5 1st Qu.:2.581 1st Qu.:0.0000 ## Median :19.20 Median :123.0 Median :3.325 Median :0.0000 ## Mean :20.09 Mean :146.7 Mean :3.217 Mean :0.4062 ## 3rd Qu.:22.80 3rd Qu.:180.0 3rd Qu.:3.610 3rd Qu.:1.0000 ## Max. :33.90 Max. :335.0 Max. :5.424 Max. :1.0000 fivenum(myvars$hp) ## [1] 52 96 123 180 335 # 五个分位数 install.packages("Hmisc") ## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1' ## (as 'lib' is unspecified) library(Hmisc) ## Loading required package: lattice ## Loading required package: survival ## Loading required package: Formula ## Loading required package: ggplot2 ## ## Attaching package: 'Hmisc' ## The following objects are masked from 'package:base': ## ## format.pval, units 1
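describe(myvars) ## myvars ## ## 4 Variables 32 Observations ## -------------------------------------------------------------------------------- ## mpg ## n missing distinct Info Mean Gmd .05 .10 ## 32 0 25 0.999 20.09 6.796 12.00 14.34 ## .25 .50 .75 .90 .95 ## 15.43 19.20 22.80 30.09 31.30 ## ## lowest : 10.4 13.3 14.3 14.7 15.0, highest: 26.0 27.3 30.4 32.4 33.9 ## -------------------------------------------------------------------------------- ## hp ## n missing distinct Info Mean Gmd .05 .10 ## 32 0 22 0.997 146.7 77.04 63.65 66.00 ## .25 .50 .75 .90 .95 ## 96.50 123.00 180.00 243.50 253.55 ## ## lowest : 52 62 65 66 91, highest: 215 230 245 264 335 ## -------------------------------------------------------------------------------- ## wt ## n missing distinct Info Mean Gmd .05 .10 ## 32 0 29 0.999 3.217 1.089 1.736 1.956 ## .25 .50 .75 .90 .95 ## 2.581 3.325 3.610 4.048 5.293 ## ## lowest : 1.513 1.615 1.835 1.935 2.140, highest: 3.845 4.070 5.250 5.345 5.424 ## -------------------------------------------------------------------------------- ## am ## n missing distinct Info Sum Mean Gmd ## 32 0 2 0.724 13 0.4062 0.498 ## ## -------------------------------------------------------------------------------- # 五个分位数 install.packages("pastecs") ## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1'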
describe(myvars) ## myvars ## ## 4 Variables 32 Observations ## -------------------------------------------------------------------------------- ## mpg ## n missing distinct Info Mean Gmd .05 .10 ## 32 0 25 0.999 20.09 6.796 12.00 14.34 ## .25 .50 .75 .90 .95 ## 15.43 19.20 22.80 30.09 31.30 ## ## lowest : 10.4 13.3 14.3 14.7 15.0, highest: 26.0 27.3 30.4 32.4 33.9 ## -------------------------------------------------------------------------------- ## hp ## n missing distinct Info Mean Gmd .05 .10 ## 32 0 22 0.997 146.7 77.04 63.65 66.00 ## .25 .50 .75 .90 .95 ## 96.50 123.00 180.00 243.50 253.55 ## ## lowest : 52 62 65 66 91, highest: 215 230 245 264 335 ## -------------------------------------------------------------------------------- ## wt ## n missing distinct Info Mean Gmd .05 .10 ## 32 0 29 0.999 3.217 1.089 1.736 1.956 ## .25 .50 .75 .90 .95 ## 2.581 3.325 3.610 4.048 5.293 ## ## lowest : 1.513 1.615 1.835 1.935 2.140, highest: 3.845 4.070 5.250 5.345 5.424 ## -------------------------------------------------------------------------------- ## am ## n missing distinct Info Sum Mean Gmd ## 32 0 2 0.724 13 0.4062 0.498 ## ## -------------------------------------------------------------------------------- # 五个分位数 install.packages("pastecs") ## Installing package into '/cloud/lib/x86_64-pc-linux-gnu-library/4.1' 2
## (as 'lib' is unspecified) library(pastecs) stat.desc(myvars) ## mpg hp wt am ## nbr.val 32.0000000 32.0000000 32.0000000 32.00000000 ## nbr.null 0.0000000 0.0000000 0.0000000 19.00000000 ## nbr.na 0.0000000 0.0000000 0.0000000 0.00000000 ## min 10.4000000 52.0000000 1.5130000 0.00000000 ## max 33.9000000 335.0000000 5.4240000 1.00000000 ## range 23.5000000 283.0000000 3.9110000 1.00000000 ## sum 642.9000000 4694.0000000 102.9520000 13.00000000 ## median 19.2000000 123.0000000 3.3250000 0.00000000 ## mean 20.0906250 146.6875000 3.2172500 0.40625000 ## SE.mean 1.0654240 12.1203173 0.1729685 0.08820997 ## CI.mean.0.95 2.1729465 24.7195501 0.3527715 0.17990541 ## var 36.3241028 4700.8669355 0.9573790 0.24899194 ## std.dev 6.0269481 68.5628685 0.9784574 0.49899092 ## coef.var 0.2999881 0.4674077 0.3041285 1.22828533 stat.desc(myvars,basic=T) ## mpg hp wt am ## nbr.val 32.0000000 32.0000000 32.0000000 32.00000000 ## nbr.null 0.0000000 0.0000000 0.0000000 19.00000000 ## nbr.na 0.0000000 0.0000000 0.0000000 0.00000000 ## min 10.4000000 52.0000000 1.5130000 0.00000000 ## max 33.9000000 335.0000000 5.4240000 1.00000000 ## range 23.5000000 283.0000000 3.9110000 1.00000000 ## sum 642.9000000 4694.0000000 102.9520000 13.00000000 ## median 19.2000000 123.0000000 3.3250000 0.00000000 ## mean 20.0906250 146.6875000 3.2172500 0.40625000 ## SE.mean 1.0654240 12.1203173 0.1729685 0.08820997 ## CI.mean.0.95 2.1729465 24.7195501 0.3527715 0.17990541 ## var 36.3241028 4700.8669355 0.9573790 0.24899194 ## std.dev 6.0269481 68.5628685 0.9784574 0.49899092 ## coef.var 0.2999881 0.4674077 0.3041285 1.22828533 #basic=T 计算一些基础值,例如缺失值的数量等等 stat.desc(myvars,desc=T)
## (as 'lib' is unspecified) library(pastecs) stat.desc(myvars) ## mpg hp wt am ## nbr.val 32.0000000 32.0000000 32.0000000 32.00000000 ## nbr.null 0.0000000 0.0000000 0.0000000 19.00000000 ## nbr.na 0.0000000 0.0000000 0.0000000 0.00000000 ## min 10.4000000 52.0000000 1.5130000 0.00000000 ## max 33.9000000 335.0000000 5.4240000 1.00000000 ## range 23.5000000 283.0000000 3.9110000 1.00000000 ## sum 642.9000000 4694.0000000 102.9520000 13.00000000 ## median 19.2000000 123.0000000 3.3250000 0.00000000 ## mean 20.0906250 146.6875000 3.2172500 0.40625000 ## SE.mean 1.0654240 12.1203173 0.1729685 0.08820997 ## CI.mean.0.95 2.1729465 24.7195501 0.3527715 0.17990541 ## var 36.3241028 4700.8669355 0.9573790 0.24899194 ## std.dev 6.0269481 68.5628685 0.9784574 0.49899092 ## coef.var 0.2999881 0.4674077 0.3041285 1.22828533 stat.desc(myvars,basic=T) ## mpg hp wt am ## nbr.val 32.0000000 32.0000000 32.0000000 32.00000000 ## nbr.null 0.0000000 0.0000000 0.0000000 19.00000000 ## nbr.na 0.0000000 0.0000000 0.0000000 0.00000000 ## min 10.4000000 52.0000000 1.5130000 0.00000000 ## max 33.9000000 335.0000000 5.4240000 1.00000000 ## range 23.5000000 283.0000000 3.9110000 1.00000000 ## sum 642.9000000 4694.0000000 102.9520000 13.00000000 ## median 19.2000000 123.0000000 3.3250000 0.00000000 ## mean 20.0906250 146.6875000 3.2172500 0.40625000 ## SE.mean 1.0654240 12.1203173 0.1729685 0.08820997 ## CI.mean.0.95 2.1729465 24.7195501 0.3527715 0.17990541 ## var 36.3241028 4700.8669355 0.9573790 0.24899194 ## std.dev 6.0269481 68.5628685 0.9784574 0.49899092 ## coef.var 0.2999881 0.4674077 0.3041285 1.22828533 #basic=T 计算一些基础值,例如缺失值的数量等等 stat.desc(myvars,desc=T) 3
## mpg hp wt am ## nbr.val 32.0000000 32.0000000 32.0000000 32.00000000 ## nbr.null 0.0000000 0.0000000 0.0000000 19.00000000 ## nbr.na 0.0000000 0.0000000 0.0000000 0.00000000 ## min 10.4000000 52.0000000 1.5130000 0.00000000 ## max 33.9000000 335.0000000 5.4240000 1.00000000 ## range 23.5000000 283.0000000 3.9110000 1.00000000 ## sum 642.9000000 4694.0000000 102.9520000 13.00000000 ## median 19.2000000 123.0000000 3.3250000 0.00000000 ## mean 20.0906250 146.6875000 3.2172500 0.40625000 ## SE.mean 1.0654240 12.1203173 0.1729685 0.08820997 ## CI.mean.0.95 2.1729465 24.7195501 0.3527715 0.17990541 ## var 36.3241028 4700.8669355 0.9573790 0.24899194 ## std.dev 6.0269481 68.5628685 0.9784574 0.49899092 ## coef.var 0.2999881 0.4674077 0.3041285 1.22828533 #desc=T 计算一些描述值,例如中位数、分位数等等 stat.desc(myvars,norm=T) ## mpg hp wt am ## nbr.val 32.0000000 32.00000000 32.00000000 3.200000e+01 ## nbr.null 0.0000000 0.00000000 0.00000000 1.900000e+01 ## nbr.na 0.0000000 0.00000000 0.00000000 0.000000e+00 ## min 10.4000000 52.00000000 1.51300000 0.000000e+00 ## max 33.9000000 335.00000000 5.42400000 1.000000e+00 ## range 23.5000000 283.00000000 3.91100000 1.000000e+00 ## sum 642.9000000 4694.00000000 102.95200000 1.300000e+01 ## median 19.2000000 123.00000000 3.32500000 0.000000e+00 ## mean 20.0906250 146.68750000 3.21725000 4.062500e-01 ## SE.mean 1.0654240 12.12031731 0.17296847 8.820997e-02 ## CI.mean.0.95 2.1729465 24.71955013 0.35277153 1.799054e-01 ## var 36.3241028 4700.86693548 0.95737897 2.489919e-01 ## std.dev 6.0269481 68.56286849 0.97845744 4.989909e-01 ## coef.var 0.2999881 0.46740771 0.30412851 1.228285e+00 ## skewness 0.6106550 0.72602366 0.42314646 3.640159e-01 ## skew.2SE 0.7366922 0.87587259 0.51048252 4.391476e-01 ## kurtosis -0.3727660 -0.13555112 -0.02271075 -1.924741e+00 ## kurt.2SE -0.2302812 -0.08373853 -0.01402987 -1.189035e+00 ## normtest.W 0.9475647 0.93341934 0.94325772 6.250744e-01
## mpg hp wt am ## nbr.val 32.0000000 32.0000000 32.0000000 32.00000000 ## nbr.null 0.0000000 0.0000000 0.0000000 19.00000000 ## nbr.na 0.0000000 0.0000000 0.0000000 0.00000000 ## min 10.4000000 52.0000000 1.5130000 0.00000000 ## max 33.9000000 335.0000000 5.4240000 1.00000000 ## range 23.5000000 283.0000000 3.9110000 1.00000000 ## sum 642.9000000 4694.0000000 102.9520000 13.00000000 ## median 19.2000000 123.0000000 3.3250000 0.00000000 ## mean 20.0906250 146.6875000 3.2172500 0.40625000 ## SE.mean 1.0654240 12.1203173 0.1729685 0.08820997 ## CI.mean.0.95 2.1729465 24.7195501 0.3527715 0.17990541 ## var 36.3241028 4700.8669355 0.9573790 0.24899194 ## std.dev 6.0269481 68.5628685 0.9784574 0.49899092 ## coef.var 0.2999881 0.4674077 0.3041285 1.22828533 #desc=T 计算一些描述值,例如中位数、分位数等等 stat.desc(myvars,norm=T) ## mpg hp wt am ## nbr.val 32.0000000 32.00000000 32.00000000 3.200000e+01 ## nbr.null 0.0000000 0.00000000 0.00000000 1.900000e+01 ## nbr.na 0.0000000 0.00000000 0.00000000 0.000000e+00 ## min 10.4000000 52.00000000 1.51300000 0.000000e+00 ## max 33.9000000 335.00000000 5.42400000 1.000000e+00 ## range 23.5000000 283.00000000 3.91100000 1.000000e+00 ## sum 642.9000000 4694.00000000 102.95200000 1.300000e+01 ## median 19.2000000 123.00000000 3.32500000 0.000000e+00 ## mean 20.0906250 146.68750000 3.21725000 4.062500e-01 ## SE.mean 1.0654240 12.12031731 0.17296847 8.820997e-02 ## CI.mean.0.95 2.1729465 24.71955013 0.35277153 1.799054e-01 ## var 36.3241028 4700.86693548 0.95737897 2.489919e-01 ## std.dev 6.0269481 68.56286849 0.97845744 4.989909e-01 ## coef.var 0.2999881 0.46740771 0.30412851 1.228285e+00 ## skewness 0.6106550 0.72602366 0.42314646 3.640159e-01 ## skew.2SE 0.7366922 0.87587259 0.51048252 4.391476e-01 ## kurtosis -0.3727660 -0.13555112 -0.02271075 -1.924741e+00 ## kurt.2SE -0.2302812 -0.08373853 -0.01402987 -1.189035e+00 ## normtest.W 0.9475647 0.93341934 0.94325772 6.250744e-01 4
## normtest.p 0.1228814 0.04880824 0.09265499 7.836354e-08 #norm=T 计算一些正态分布统计量,例如偏度、峰度等 独立性检验函数 独立性检验:是根据频数信息判断两类因子彼此相关或互相独立的假设检验。 卡方检验、Fisher 检验、Cochran-Mantel-Haenszel 检验。 p-value 就是 probability 值,是通过计算得到的概率值,即在原假设为真时,得到最大的/超出所得到的检验统计量值的概率。 一般将 p 值定位到 0.05:当 p<0.05 时拒绝原假设,当 p>0.05 时不拒绝原假设。 相关性分析 进行过独立性检验后,才能进行相关分析。 相关性衡量指标 Pearson 相关系数、Spearman 相关系数、Kendall 相关系数、偏相关系数、多分格 (polychoric) 相关系数、多系列 (polyserial) 相关系数 ?cor # 通过 method 可以计算 pearson\kendall\spearman 相关系数 state.x77 ## Population Income Illiteracy Life Exp Murder HS Grad Frost ## Alabama 3615 3624 2.1 69.05 15.1 41.3 20 ## Alaska 365 6315 1.5 69.31 11.3 66.7 152 ## Arizona 2212 4530 1.8 70.55 7.8 58.1 15 ## Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 ## California 21198 5114 1.1 71.71 10.3 62.6 20 ## Colorado 2541 4884 0.7 72.06 6.8 63.9 166 ## Connecticut 3100 5348 1.1 72.48 3.1 56.0 139 ## Delaware 579 4809 0.9 70.06 6.2 54.6 103 ## Florida 8277 4815 1.3 70.66 10.7 52.6 11 ## Georgia 4931 4091 2.0 68.54 13.9 40.6 60 ## Hawaii 868 4963 1.9 73.60 6.2 61.9 0 ## Idaho 813 4119 0.6 71.87 5.3 59.5 126 ## Illinois 11197 5107 0.9 70.14 10.3 52.6 127 ## Indiana 5313 4458 0.7 70.88 7.1 52.9 122 ## Iowa 2861 4628 0.5 72.56 2.3 59.0 140
## normtest.p 0.1228814 0.04880824 0.09265499 7.836354e-08 #norm=T 计算一些正态分布统计量,例如偏度、峰度等 独立性检验函数 独立性检验:是根据频数信息判断两类因子彼此相关或互相独立的假设检验。 卡方检验、Fisher 检验、Cochran-Mantel-Haenszel 检验。 p-value 就是 probability 值,是通过计算得到的概率值,即在原假设为真时,得到最大的/超出所得到的检验统计量值的概率。 一般将 p 值定位到 0.05:当 p<0.05 时拒绝原假设,当 p>0.05 时不拒绝原假设。 相关性分析 进行过独立性检验后,才能进行相关分析。 相关性衡量指标 Pearson 相关系数、Spearman 相关系数、Kendall 相关系数、偏相关系数、多分格 (polychoric) 相关系数、多系列 (polyserial) 相关系数 ?cor # 通过 method 可以计算 pearson\kendall\spearman 相关系数 state.x77 ## Population Income Illiteracy Life Exp Murder HS Grad Frost ## Alabama 3615 3624 2.1 69.05 15.1 41.3 20 ## Alaska 365 6315 1.5 69.31 11.3 66.7 152 ## Arizona 2212 4530 1.8 70.55 7.8 58.1 15 ## Arkansas 2110 3378 1.9 70.66 10.1 39.9 65 ## California 21198 5114 1.1 71.71 10.3 62.6 20 ## Colorado 2541 4884 0.7 72.06 6.8 63.9 166 ## Connecticut 3100 5348 1.1 72.48 3.1 56.0 139 ## Delaware 579 4809 0.9 70.06 6.2 54.6 103 ## Florida 8277 4815 1.3 70.66 10.7 52.6 11 ## Georgia 4931 4091 2.0 68.54 13.9 40.6 60 ## Hawaii 868 4963 1.9 73.60 6.2 61.9 0 ## Idaho 813 4119 0.6 71.87 5.3 59.5 126 ## Illinois 11197 5107 0.9 70.14 10.3 52.6 127 ## Indiana 5313 4458 0.7 70.88 7.1 52.9 122 ## Iowa 2861 4628 0.5 72.56 2.3 59.0 140 5