2009-10-02 28 views
8

Tengo un marco de datos, y quiero producir una tabla de estadísticas de resumen que incluya el número de valores numéricos válidos, media y sd por grupo para cada una de las tres columnas. Parece que no puedo encontrar ninguna función para contar el número de valores numéricos en R. Puedo usar length() que me dice cuántos valores hay, y puedo usar colSums (is.na (x)) para contar el número de valores NA, pero colSums (is.numeric (x)) no funciona de la misma manera.Cómo contar el número de valores numéricos en una columna

Podría usar tapply con {longitud - número de valores de NA - cantidad de valores en blanco - cantidad de valores de texto} pero seguramente hay una manera más fácil.

Mis datos (Quiero grupo por nominal y producen estadísticas de resumen sobre el real, y LinPred QualPred)

structure(list(Nominal = c(1, 3, 6, 10, 30, 50, 150, 250, 1, 
3, 6, 10, 30, 50, 150, 250, 1, 3, 6, 10, 30, 50, 150, 250, 1, 
3, 6, 10, 30, 50, 150, 250, 1, 3, 6, 10, 30, 50, 150, 250, 1, 
3, 6, 10, 30, 50, 150, 250, 1, 3, 6, 10, 30, 50, 150, 250, 1, 
3, 6, 10, 30, 50, 150, 250, 1, 3, 6, 10, 30, 50, 150, 250), Actual = c(NA, 
0.422, 0.782, 1.25, 3.85, 6.94, 18.8, 31.2, 0.118, 0.361, 0.747, 
1.18, 3.58, 5.82, 16.7, 29, 0.113, 0.382, 0.692, 1.12, 3.51, 
5.43, 17.1, 28.7, 0.134, 0.402, 0.718, 1.25, 3.65, 6.52, NA, 
28.8, 0.123, 0.396, 0.664, 1.12, 3.83, 5.6, NA, 28.1, 0.112, 
0.341, 0.7, 1.08, 3.25, 5.97, NA, 27.1, 0.106, 0.35, 0.674, 1.14, 
3.28, 5.5, 17.3, 30, 0.122, 0.321, 0.673, 1.22, 3.41, 5.85, 17.6, 
28.1, 0.129, 0.351, 0.737, 1.06, 3.39, 5.53, 15.9, 28.5), LinPred = c(NA, 
3.49519490135683, 6.4706724568458, 10.3387932789814, 31.8283534019573, 
57.3678690865708, 155.393324109068, 257.881995464799, 0.982569410055046, 
2.99101676001009, 6.18138991672881, 9.76022819874748, 29.5967452353405, 
48.1108278028274, 138.036371702049, 239.698521514589, 0.941243332895477, 
3.16458628408028, 5.72680306797355, 9.26431527283265, 29.0181801551066, 
44.887393784381, 141.342457874815, 237.218956885015, 1.07941778099747, 
3.36900393602722, 6.0686652233011, 10.6136646056736, 31.1174212178803, 
55.6364968333108, NA, 245.979704049963, 0.98544222985819, 3.3177445444967, 
5.60733069952645, 9.50304445584572, 32.6552029637958, 47.7767234652982, 
NA, 239.999441704736, 0.89146667871891, 2.8478667888003, 5.91488704870955, 
9.1613151789756, 27.7001284491792, 50.9377192763467, NA, 231.456209782983, 
0.887738051402174, 3.04188235451485, 5.9023034783202, 10.0163659588551, 
28.9092709123842, 48.5084526866061, 152.684283738776, 264.805729023739, 
1.02899341554071, 2.78585700701375, 5.89347501806154, 10.7226427795477, 
30.0569707460098, 51.5984137771366, 155.332821816374, 248.031654532288, 
1.09079263735132, 3.05071081477351, 6.45849647461568, 9.31008913816238, 
29.8804015408367, 48.7733064943658, 140.324439376654, 251.563038635751 
), QuadPred = c(NA, 3.46077095737974, 6.38659713413108, 10.1956079501556, 
31.4700369979564, 57.0089799611706, 157.775316006369, 268.303966059862, 
0.99289436409299, 2.96536517477853, 6.10198249392715, 9.62549220297933, 
29.2517496204359, 47.7196128593832, 139.600469198163, 248.272682787657, 
0.95232583127381, 3.13590297331348, 5.65480031033985, 9.13693141349813, 
28.6769820181676, 44.4936547741659, 143.050878627236, 245.555818447238, 
1.08417831830729, 3.33895371044810, 6.00044125019758, 10.4882228621509, 
30.8451526869812, 55.4331759085967, NA, 256.446833964951, 0.991679220421247, 
3.28844923081897, 5.54540949253351, 9.3907657095483, 32.3793538902883, 
47.5218142460371, NA, 249.828516445647, 0.899183876120787, 2.82554368740693, 
5.84875388286628, 9.05319326862309, 27.4395572248486, 50.7001828907023, 
NA, 240.411024762687, 0.884412915928806, 3.05257006009469, 5.93046554432476, 
10.0673979669, 29.0311859234644, 48.645035648271, 151.914544909710, 
261.273991566153, 1.02660962824666, 2.79491765184684, 5.92158513760114, 
10.7773327827008, 30.1813919027873, 51.7318741314584, 154.518856412401, 
245.027488125567, 1.08881969774848, 3.06145444119556, 6.48990638077339, 
9.35738460692028, 30.0044505131336, 48.9096796323938, 139.747394069421, 
248.451100154569)), .Names = c("Nominal", "Actual", "LinPred", 
"QuadPred"), row.names = c(NA, -72L), class = "data.frame") 

Respuesta

9

Estos son algunos paquetes complemento que podría ayudar (ver Quick-R)

Utilizando el Hmisc paquete

library(Hmisc) 

describe(mydata) 
# n, nmiss, unique, mean, 5,10,25,50,75,90,95th percentiles 
# 5 lowest and 5 highest scores 

Utilizando el pastecs paquete

library(pastecs) 

stat.desc(mydata) 
# nbr.val, nbr.null, nbr.na, min max, range, sum, 
# median, mean, SE.mean, CI.mean, var, std.dev, coef.var 

Utilizando el psych paquete

library(psych) 
describe(mydata) 
# item name ,item number, nvalid, mean, sd, 
# median, mad, min, max, skew, kurtosis, se 

Utilizaría describe.by del paquete psych;

> describe.by(biastable, as.factor(Nominal)) 
group: 1 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 1.00 0.00 1.00 1.00 0.00 1.00 1.00 0.00 NaN  NaN 0.00 
Actual  2 8 0.12 0.01 0.12 0.12 0.01 0.11 0.13 0.03 0.09 -1.47 0.00 
LinPred 3 8 0.99 0.08 0.98 0.99 0.10 0.89 1.09 0.20 0.04 -1.70 0.03 
QuadPred 4 8 0.99 0.08 0.99 0.99 0.10 0.88 1.09 0.20 -0.04 -1.64 0.03 
------------------------------------------------------------------------ 
group: 3 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 3.00 0.00 3.00 3.00 0.00 3.00 3.00 0.00 NaN  NaN 0.00 
Actual  2 9 0.37 0.03 0.36 0.37 0.03 0.32 0.42 0.10 0.15 -1.50 0.01 
LinPred 3 9 3.12 0.24 3.05 3.12 0.30 2.79 3.50 0.71 0.15 -1.52 0.08 
QuadPred 4 9 3.10 0.23 3.06 3.10 0.34 2.79 3.46 0.67 0.12 -1.51 0.08 
------------------------------------------------------------------------ 
group: 6 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 6.00 0.00 6.00 6.00 0.00 6.00 6.00 0.00 NaN  NaN 0.00 
Actual  2 9 0.71 0.04 0.70 0.71 0.04 0.66 0.78 0.12 0.46 -1.30 0.01 
LinPred 3 9 6.02 0.30 5.91 6.02 0.28 5.61 6.47 0.86 0.28 -1.43 0.10 
QuadPred 4 9 5.99 0.31 5.93 5.99 0.25 5.55 6.49 0.94 0.26 -1.26 0.10 
------------------------------------------------------------------------ 
group: 10 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 10.00 0.00 10.00 10.00 0.00 10.00 10.00 0.00 NaN  NaN 0.00 
Actual  2 9 1.16 0.07 1.14 1.16 0.09 1.06 1.25 0.19 0.09 -1.71 0.02 
LinPred 3 9 9.85 0.60 9.76 9.85 0.74 9.16 10.72 1.56 0.24 -1.76 0.20 
QuadPred 4 9 9.79 0.62 9.63 9.79 0.72 9.05 10.78 1.72 0.27 -1.65 0.21 
------------------------------------------------------------------------ 
group: 30 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 30.00 0.00 30.00 30.00 0.00 30.00 30.00 0.00 NaN  NaN 0.00 
Actual  2 9 3.53 0.22 3.51 3.53 0.21 3.25 3.85 0.60 0.23 -1.58 0.07 
LinPred 3 9 30.08 1.55 29.88 30.08 1.44 27.70 32.66 4.96 0.21 -1.27 0.52 
QuadPred 4 9 29.92 1.51 30.00 29.92 1.44 27.44 32.38 4.94 0.04 -1.22 0.50 
------------------------------------------------------------------------ 
group: 50 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 50.00 0.00 50.00 50.00 0.00 50.00 50.00 0.00 NaN  NaN 0.00 
Actual  2 9 5.91 0.51 5.82 5.91 0.43 5.43 6.94 1.51 0.90 -0.73 0.17 
LinPred 3 9 50.40 3.98 48.77 50.40 3.21 44.89 57.37 12.48 0.49 -1.16 1.33 
QuadPred 4 9 50.24 3.97 48.91 50.24 2.65 44.49 57.01 12.52 0.39 -1.21 1.32 
------------------------------------------------------------------------ 
group: 150 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 150.00 0.00 150.00 150.00 0.00 150.00 150.00 0.00 NaN  NaN 0.00 
Actual  2 6 17.23 0.97 17.20 17.23 0.67 15.90 18.80 2.90 0.25 -1.23 0.39 
LinPred 3 6 147.19 8.11 147.01 147.19 11.13 138.04 155.39 17.36 -0.01 -2.22 3.31 
QuadPred 4 6 147.77 7.95 147.48 147.77 10.95 139.60 157.78 18.17 0.07 -2.10 3.25 
------------------------------------------------------------------------ 
group: 250 
     var n mean sd median trimmed mad min max range skew kurtosis se 
Nominal 1 9 250.00 0.00 250.00 250.00 0.00 250.00 250.00 0.00 NaN  NaN 0.00 
Actual  2 9 28.83 1.18 28.70 28.83 0.89 27.10 31.20 4.10 0.59 -0.57 0.39 
LinPred 3 9 246.29 10.57 245.98 246.29 9.31 231.46 264.81 33.35 0.33 -1.26 3.52 
QuadPred 4 9 251.51 8.84 248.45 251.51 5.08 240.41 268.30 27.89 0.62 -1.04 2.95 
> 
3

¿Se puede usar algo como esto?

length(unique(x)) 
0

¿Se complete.cases (o sum(complete.cases)) hacer lo que quiera?

3

¿Qué son los "valores en blanco" y los "valores de texto"? Si tiene un vector numérico, entonces podría tener NA (is.na()), Inf (is.infinite()), NaN (is.nan()) y valores numéricos "válidos".

Para valores numéricos "válidos" (en el sentido anterior) se puede utilizar is.finite():

is.finite(c(1,NA,Inf,NaN)) 
# [1] TRUE FALSE FALSE FALSE 
sum(is.finite(c(1,NA,Inf,NaN))) 
# [1] 1 

Así colSums(is.numeric(x)) se podría hacer como colSums(is.finite(x)).

+0

+1 Anote exactamente lo que pidió el OP, pero exactamente lo que estaba buscando :-) – scraimer

6

colSums(!is.na(x)) debería funcionar.

Cuestiones relacionadas