library(tidyverse)  # data wrangling + plots
dim(df)      # rows x columns
str(df)      # data types
summary(df)  # min, median, mean, max
head(df, 5)  # first 5 rows
df %>%                                 # pipe operator
  filter(preis > 50) %>%               # filter rows
  mutate(marge = umsatz / kosten) %>%  # compute a column
  group_by(marke) %>%                  # group
  summarize(m = mean(umsatz),          # aggregate
            s = sd(umsatz),
            n = n()) %>%
  arrange(desc(m))                     # sort
ggplot(df, aes(x, y)) + geom_point() + geom_smooth(method = "lm")  # scatter + trend
ggplot(df, aes(x)) + geom_histogram(bins = 20)                     # distribution
ggplot(df, aes(g, y)) + geom_boxplot()                             # group comparison
mod <- lm(umsatz ~ werbung, data = df)  # fit the model
summary(mod)  # coefficients, SE, p, R²
coef(mod)     # coefficients
confint(mod)  # 95% CI
# Multiple predictors
mod <- lm(y ~ x1 + x2 + x3, data = df)  # additive
# Interaction: the effect of x1 depends on g
mod <- lm(y ~ x1 * g, data = df)        # with interaction
plot(fitted(mod), residuals(mod))    # residual plot
abline(h = 0, col = "red")           # zero line
qqnorm(residuals(mod))               # check normality
qqline(residuals(mod), col = "red")  # reference line
set.seed(42)                                      # reproducible
idx <- sample(1:nrow(df), floor(0.8 * nrow(df)))  # random 80%
train <- df[idx, ]                                # training set
test <- df[-idx, ]                                # test set
pred <- predict(mod, newdata = test)              # predict
rmse <- sqrt(mean((test$y - pred)^2))             # compute the error
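# Hedged sanity check, not part of the original recipe: a useful model
# should beat the naive baseline that predicts the training mean everywhere.
baseline <- sqrt(mean((test$y - mean(train$y))^2))
rmse < baseline  # TRUE if the model adds value over the mean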
library(tidyverse)  # data wrangling + plots
library(infer)      # resampling tools
boot <- df %>%
  specify(response = umsatz) %>%                 # target variable
  generate(reps = 1000, type = "bootstrap") %>%  # 1000 resamples
  calculate(stat = "mean")                       # compute the mean
ci <- boot %>% get_confidence_interval(level = 0.95)  # 95% CI
boot %>% visualize() +            # bootstrap distribution
  shade_confidence_interval(ci)   # draw the CI
obs <- df %>%
  specify(y ~ gruppe) %>%                                 # specify the formula
  calculate(stat = "diff in means", order = c("A", "B"))  # observed difference
null <- df %>%
  specify(y ~ gruppe) %>%
  hypothesize(null = "independence") %>%                  # H0: no difference
  generate(reps = 1000, type = "permute") %>%             # permute
  calculate(stat = "diff in means", order = c("A", "B"))  # null distribution
null %>% get_p_value(obs, direction = "two-sided")        # p-value
null %>% visualize() +
  shade_p_value(obs, direction = "two-sided")             # show the result
t.test(y ~ gruppe, data = df)        # 2 groups
t.test(x, mu = 7)                    # one sample
anova <- aov(y ~ gruppe, data = df)  # 3+ groups
summary(anova)                       # results
TukeyHSD(anova)                      # post-hoc
replicate(1000, {                 # 1000 simulations
  a <- rnorm(50); b <- rnorm(50)  # H0 true: same distribution
  t.test(a, b)$p.value < 0.05     # significant?
}) %>% mean()                     # ≈ 0.05
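# Hedged variant of the same simulation with a true effect (d = 0.5):
# the rejection rate now estimates power instead of the alpha level.
replicate(1000, {
  a <- rnorm(50); b <- rnorm(50, mean = 0.5)  # H0 false
  t.test(a, b)$p.value < 0.05
}) %>% mean()  # ≈ 0.70 for n = 50 per group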
p <- c(0.01, 0.04, 0.03, 0.15, 0.08)  # raw p-values
p.adjust(p, method = "bonferroni")    # conservative
p.adjust(p, method = "BH")            # FDR
cohens_d <- function(x, y) {           # effect size
  sp <- sqrt(((length(x)-1)*sd(x)^2 +  # pooled SD
              (length(y)-1)*sd(y)^2) / (length(x)+length(y)-2))
  (mean(x) - mean(y)) / sp             # standardized difference
}  # 0.2 small, 0.5 medium, 0.8 large
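# Hedged usage check with simulated groups whose true d is 0.8:
set.seed(1)
g1 <- rnorm(200, mean = 0.8); g2 <- rnorm(200)
cohens_d(g1, g2)  # should land near 0.8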
ev <- p * gain + (1-p) * loss       # expected value
breakeven <- -loss / (gain - loss)  # break-even probability
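# Hedged worked example with made-up numbers: a 30% chance of gaining 100
# against a loss of -20.
ev <- 0.3 * 100 + 0.7 * (-20)        # 30 - 14 = 16
breakeven <- -(-20) / (100 - (-20))  # 20/120 ≈ 0.167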
library(car)     # VIF
library(glmnet)  # lasso/ridge
library(caret)   # cross-validation
cor(df[, c("x1","x2","x3")])       # correlation matrix
mod <- lm(y ~ x1 + x2, data = df)  # fit the regression
vif(mod)                           # >5 problematic, >10 severe
# Underfitting → overfitting
mod <- lm(y ~ poly(x, 3), data = train)  # degree-3 polynomial
pred <- predict(mod, test)               # predictions
mse <- mean((test$y - pred)^2)           # test error
# Optimal degree: lowest test MSE (see the sketch below)
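# Hedged sketch of the degree search the comment describes; assumes the
# train/test split from above and a single numeric predictor x.
mse_by_degree <- sapply(1:10, function(d) {
  m <- lm(y ~ poly(x, d), data = train)
  mean((test$y - predict(m, test))^2)
})
which.min(mse_by_degree)  # degree with the lowest test MSE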
X <- as.matrix(df[, c("x1","x2","x3")])  # feature matrix
y <- df$y                                # response vector
fit <- glmnet(X, y, alpha = 1)           # lasso
cv <- cv.glmnet(X, y, alpha = 1)         # cross-validated lambda
coef(cv, s = "lambda.min")               # best coefficients
coef(cv, s = "lambda.1se")               # sparser alternative
fit <- glmnet(X, y, alpha = 0)  # ridge
# Lasso → coefficients exactly 0 (selection)
# Ridge → coefficients near 0 (shrinkage)
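# One hedged way to see the contrast, reusing the fits from above:
plot(cv)                    # CV error across lambda (lasso)
plot(fit, xvar = "lambda")  # ridge coefficient paths shrink smoothly toward 0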
ctrl <- trainControl(method = "cv", number = 10)  # 10-fold CV
mod <- train(y ~ ., data = df,                    # train the model
             method = "lm", trControl = ctrl)
mod$results$RMSE      # CV RMSE
mod$results$Rsquared  # CV R²
mod <- glm(churn ~ alter + beschwerden,  # logit model
           data = df, family = binomial)
exp(coef(mod))     # odds ratios
exp(confint(mod))  # ORs with CI
prob <- predict(mod, type = "response")  # predicted probability
pred <- ifelse(prob > 0.5, 1, 0)         # predicted class
tab <- table(Predicted = pred, Actual = df$churn)  # confusion matrix
TP <- tab["1","1"]; TN <- tab["0","0"]  # extract the four cells
FP <- tab["1","0"]; FN <- tab["0","1"]  # (assumes 0/1 coding)
acc <- (TP+TN) / (TP+TN+FP+FN)  # accuracy
pre <- TP / (TP+FP)             # precision
rec <- TP / (TP+FN)             # recall
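# Hedged addition: F1, the harmonic mean of precision and recall,
# summarizes both in one number.
f1 <- 2 * pre * rec / (pre + rec)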
# n per group for 80% power:
n <- ceiling(16 / d^2)  # sample size
# d=0.2 → n=400, d=0.5 → n=64, d=0.8 → n=25
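# Hedged cross-check with base R's exact calculation (assumes sd = 1 and
# two-sided alpha = 0.05); the 16/d^2 rule approximates this.
power.t.test(delta = 0.5, sd = 1, sig.level = 0.05, power = 0.80)
# gives n ≈ 64 per group, matching the rule of thumb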