Variabili categoriali

Tabelle di frequenza: table(variabile).

Exploratory Data Analysis Using R – Datazar Blog

Why do we use exploratory graphs in data analysis?

Understand data properties
Find patterns in data
Suggest modeling strategies
“Debug” analyses

Da psych

Arabica

# Read the cleaned arabica table of the coffee-quality-database from GitHub.
arabica <- read.csv(
  file = "https://raw.githubusercontent.com/jldbc/coffee-quality-database/master/data/arabica_data_cleaned.csv"
)
# Show the size of the table as (rows, columns).
dim(arabica)

## [1] 1311   44

# str(arabica)

Estraggo le variabili più interessanti

# Keep the country of origin of every sample as a stand-alone vector.
country <- arabica[["Country.of.Origin"]]
# List every column name available in the raw table.
colnames(arabica)

##  [1] "X"                     "Species"               "Owner"                 "Country.of.Origin"    
##  [5] "Farm.Name"             "Lot.Number"            "Mill"                  "ICO.Number"           
##  [9] "Company"               "Altitude"              "Region"                "Producer"             
## [13] "Number.of.Bags"        "Bag.Weight"            "In.Country.Partner"    "Harvest.Year"         
## [17] "Grading.Date"          "Owner.1"               "Variety"               "Processing.Method"    
## [21] "Aroma"                 "Flavor"                "Aftertaste"            "Acidity"              
## [25] "Body"                  "Balance"               "Uniformity"            "Clean.Cup"            
## [29] "Sweetness"             "Cupper.Points"         "Total.Cup.Points"      "Moisture"             
## [33] "Category.One.Defects"  "Quakers"               "Color"                 "Category.Two.Defects" 
## [37] "Expiration"            "Certification.Body"    "Certification.Address" "Certification.Contact"
## [41] "unit_of_measurement"   "altitude_low_meters"   "altitude_high_meters"  "altitude_mean_meters"

# Columns 21 to 32 run from "Aroma" through "Moisture": keep them together.
valutazioni <- arabica[, 21:32, drop = FALSE]
# Check which columns were selected.
colnames(valutazioni)

##  [1] "Aroma"            "Flavor"           "Aftertaste"       "Acidity"          "Body"            
##  [6] "Balance"          "Uniformity"       "Clean.Cup"        "Sweetness"        "Cupper.Points"   
## [11] "Total.Cup.Points" "Moisture"

# Pull out the bean colour and the mean altitude columns.
colore <- arabica[["Color"]]
altitudine <- arabica[["altitude_mean_meters"]]
# Assemble the working data frame: country, the score columns, colour, altitude.
caffe <- data.frame(
  country = country,
  valutazioni,
  colore = colore,
  altitudine = altitudine
)
# Show the size of the assembled table as (rows, columns).
dim(caffe)

## [1] 1311   15

# The last row (row 1311) is dirty: keep every row except that one.
caffe <- caffe[seq_len(nrow(caffe)) != 1311L, ]
# Some altitude values are wrong: treat anything above 3000 as missing.
caffe$altitudine[which(caffe$altitudine > 3000)] <- NA

# Preview the first six rows of the cleaned data frame.
head(caffe, n = 6L)

##     country Aroma Flavor Aftertaste Acidity Body Balance Uniformity Clean.Cup Sweetness Cupper.Points
## 1  Ethiopia  8.67   8.83       8.67    8.75 8.50    8.42         10        10        10          8.75
## 2  Ethiopia  8.75   8.67       8.50    8.58 8.42    8.42         10        10        10          8.58
## 3 Guatemala  8.42   8.50       8.42    8.42 8.33    8.42         10        10        10          9.25
## 4  Ethiopia  8.17   8.58       8.42    8.42 8.50    8.25         10        10        10          8.67
## 5  Ethiopia  8.25   8.50       8.25    8.50 8.42    8.33         10        10        10          8.58
## 6    Brazil  8.58   8.42       8.42    8.50 8.25    8.33         10        10        10          8.33
##   Total.Cup.Points Moisture       colore altitudine
## 1            90.58     0.12        Green       2075
## 2            89.92     0.12        Green       2075
## 3            89.75     0.00                    1700
## 4            89.00     0.11        Green       2000
## 5            88.83     0.12        Green       2075
## 6            88.83     0.11 Bluish-Green         NA

Che rapporto c'è fra l'altitudine della coltivazione e l'aroma?

# Linear regression of Aroma on the cultivation altitude.
lmAltAroma <- lm(formula = caffe$Aroma ~ caffe$altitudine)
# Scatter plot of the same two variables...
plot(caffe$Aroma ~ caffe$altitudine)
# ...with the fitted regression line drawn on top of it.
abline(reg = lmAltAroma)

plot of chunk caffe--desc_caffe_corr

Il fattore country ha 37 livelli. Molti di questi, però, hanno meno di 10 osservazioni.

# Number of samples per country, from the rarest to the most frequent.
sort(table(caffe[["country"]]), decreasing = FALSE)

## 
##                                             Cote d?Ivoire                      Ecuador                        India 
##                            1                            1                            1                            1 
##                        Japan                    Mauritius             Papua New Guinea                       Rwanda 
##                            1                            1                            1                            1 
##                       Zambia                      Burundi                         Laos                       Panama 
##                            1                            2                            3                            4 
##  United States (Puerto Rico)                  Philippines                        Haiti                      Vietnam 
##                            4                            5                            6                            7 
##                      Myanmar                United States                         Peru                       Malawi 
##                            8                            8                           10                           11 
##                        China                    Indonesia                  El Salvador                        Kenya 
##                           16                           20                           21                           25 
##                    Nicaragua                       Uganda                     Thailand Tanzania, United Republic Of 
##                           26                           26                           32                           40 
##                     Ethiopia                   Costa Rica                     Honduras       United States (Hawaii) 
##                           44                           51                           52                           73 
##                       Taiwan                       Brazil                    Guatemala                     Colombia 
##                           75                          132                          181                          183 
##                       Mexico 
##                          236

Selezioniamo le osservazioni dei paesi con più di 100 osservazioni.

r - Subset data frame based on number of rows per group - Stack Overflow

# Count the observations available for each country.
table_country <- table(caffe$country)
# Keep only the rows whose country has more than 100 observations.
caffe_top_country <- caffe[
  caffe$country %in% names(table_country[table_country > 100]),
]
# Rebuild the factor so that only the countries still present remain as levels.
caffe_top_country$country <- factor(caffe_top_country$country)
# Remove the dirty outliers (Aroma below 4) with a logical mask.
# The previous `df[-which(cond), ]` form is unsafe: when no row matches,
# `which()` returns integer(0) and `-integer(0)` selects zero rows, silently
# emptying the data frame. Rows with a missing Aroma are kept, as before.
caffe_top_country <- caffe_top_country[
  is.na(caffe_top_country$Aroma) | caffe_top_country$Aroma >= 4,
]
# Bar chart of the number of observations per country.
barplot(table(caffe_top_country$country))

plot of chunk caffe--barplot_country

# Box plot of Aroma grouped by country, for the selected top countries.
boxplot(caffe_top_country$Aroma ~ caffe_top_country$country)

## Warning in min(x): nessun argomento non-mancante al minimo; si restituisce Inf

## Warning in max(x): nessun argomento non-mancante al massimo; si restituisce -Inf

## Error in plot.window(xlim = xlim, ylim = ylim, log = log, yaxs = pars$yaxs): i valori di 'ylim' devono essere finiti

plot of chunk caffe--boxplot_country

Con la funzione aggregate(x, by, FUN) calcoliamo la media della variabile Aroma per ognuno dei paesi top.

# Mean Aroma for each of the top countries (one output row per country).
medie_aroma_top_country <- aggregate(
  x = caffe_top_country$Aroma,
  by = list(caffe_top_country$country),
  FUN = mean
)

## Error in aggregate.data.frame(as.data.frame(x), ...): no rows to aggregate

# Display the table of per-country Aroma means.
print(medie_aroma_top_country)

## Error in eval(expr, envir, enclos): oggetto 'medie_aroma_top_country' non trovato

Che rapporto c'è fra le varie valutazioni?

# Scatter-plot matrix of columns 2 to 5 of `caffe` (Aroma, Flavor, Aftertaste,
# Acidity), with Pearson correlations and density curves.
# `pairs.panels()` belongs to the psych package, which is not attached in the
# visible code (hence the "could not find function" error), so it is called
# through its namespace.
psych::pairs.panels(
  caffe[2:5],
  method = "pearson", # correlation method
  density = TRUE # show density plots
)

## Error in pairs.panels(caffe[2:5], method = "pearson", density = TRUE): non trovo la funzione "pairs.panels"

# Aroma against Flavor for the top countries, with `country` passed as the
# point colour.
plot(
  caffe_top_country$Aroma ~ caffe_top_country$Flavor,
  xlab = "Flavor",
  ylab = "Aroma",
  col = caffe_top_country$country
)

## Warning in min(x): nessun argomento non-mancante al minimo; si restituisce Inf

## Warning in max(x): nessun argomento non-mancante al massimo; si restituisce -Inf

## Warning in min(x): nessun argomento non-mancante al minimo; si restituisce Inf

## Warning in max(x): nessun argomento non-mancante al massimo; si restituisce -Inf

## Error in plot.window(...): i valori 'xlim' devono essere finiti

plot of chunk caffe--aroma_flavor

# Legend mapping each country to its point colour.
# The scatter plot colours points by the integer codes of the `country`
# factor, so the legend colours are derived from the factor levels instead of
# a hard-coded c(1, 2, 3, 4), which is only right with exactly four countries.
legend(
  "topleft",
  legend = levels(caffe_top_country$country),
  col = seq_along(levels(caffe_top_country$country)),
  pch = 1
)

Descrittiva

Variabili categoriali

Da psych

Arabica

Cookies