Block 1: Fundamentals & Regression
Setup
library(tidyverse) # data wrangling + plots
Explore Data
dim(df)          # rows x columns
str(df)          # data types
summary(df)      # min, median, mean, max
head(df, 5)      # first 5 rows
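dplyr's glimpse() is a compact alternative to str() for data frames:
glimpse(df)      # one line per column: type + first values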
Tidyverse Pipeline
df %>%  # pipe operator
  filter(preis > 50) %>% # filter rows
  mutate(marge = umsatz / kosten) %>% # compute a new column
  group_by(marke) %>% # group
  summarize(m = mean(umsatz), # aggregate
            s = sd(umsatz),
            n = n()) %>%
  arrange(desc(m)) # sort descending
ggplot2
ggplot(df, aes(x, y)) +
  geom_point() + geom_smooth(method="lm") # scatter + linear trend

ggplot(df, aes(x)) + geom_histogram(bins=20) # distribution
ggplot(df, aes(g, y)) + geom_boxplot() # group comparison
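A small sketch for per-group panels, assuming the grouping column g from above:
ggplot(df, aes(x, y)) + geom_point() + facet_wrap(~g) # one panel per group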
Simple Regression
mod <- lm(umsatz ~ werbung, data=df) # fit the model
summary(mod)     # coefficients, SEs, p-values, R²
coef(mod)        # coefficients
confint(mod)     # 95% CI
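Sketch for predicting at a new value (werbung = 100 is illustrative):
predict(mod, newdata=data.frame(werbung=100), interval="confidence") # fit + 95% CI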
Multiple Regression
# Several predictors
mod <- lm(y ~ x1 + x2 + x3, data=df) # additive

# Interaction: the effect of x1 depends on g
mod <- lm(y ~ x1 * g, data=df) # with interaction
Diagnostics
plot(fitted(mod), residuals(mod)) # residual plot
abline(h=0, col="red") # zero line
qqnorm(residuals(mod)) # check normality
qqline(residuals(mod), col="red") # reference line
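Base R bundles four diagnostic plots for lm objects; a one-line sketch:
par(mfrow=c(2,2)); plot(mod); par(mfrow=c(1,1)) # residuals, QQ, scale-location, leverage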
Train / Test Split
set.seed(42) # reproducible
idx <- sample(nrow(df), floor(0.8*nrow(df))) # random 80%
train <- df[idx, ] # training set
test  <- df[-idx, ] # test set

mod  <- lm(y ~ x1 + x2, data=train) # refit the model on train only
pred <- predict(mod, newdata=test) # predict on the test set
rmse <- sqrt(mean((test$y - pred)^2)) # test error (RMSE)
Block 2: Inference & Uncertainty
Setup
library(tidyverse) # data wrangling + plots
library(infer) # resampling tools
Bootstrap (infer)
boot <- df %>%
  specify(response = umsatz) %>% # response variable
  generate(reps=1000, type="bootstrap") %>% # 1000 resamples
  calculate(stat = "mean") # compute the mean

ci <- boot %>% get_confidence_interval(level=0.95) # 95% CI
boot %>% visualize() + # bootstrap distribution
  shade_confidence_interval(ci) # shade the CI
Permutation Test (infer)
obs <- df %>%
  specify(y ~ gruppe) %>% # specify the formula
  calculate(stat="diff in means", order=c("A","B")) # observed difference

null <- df %>%
  specify(y ~ gruppe) %>%
  hypothesize(null="independence") %>% # H0: no difference
  generate(reps=1000, type="permute") %>% # shuffle group labels
  calculate(stat="diff in means", order=c("A","B")) # null distribution

null %>% get_p_value(obs, direction="two-sided") # p-value
null %>% visualize() + shade_p_value(obs, direction="two-sided") # show result
Traditional Tests
t.test(y ~ gruppe, data=df)   # 2 groups
t.test(x, mu=7)               # 1 sample

anova <- aov(y ~ gruppe, data=df) # 3+ groups
summary(anova) # results
TukeyHSD(anova)              # post-hoc comparisons
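Pairwise t-tests with adjusted p-values are an alternative post-hoc sketch (ties into Multiple Testing below):
pairwise.t.test(df$y, df$gruppe, p.adjust.method="BH") # all pairs, BH-adjusted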
Simulating Type I Error
replicate(1000, { # 1000 simulations
  a <- rnorm(50); b <- rnorm(50) # H0 true: same distribution
  t.test(a, b)$p.value < 0.05 # significant?
}) %>% mean()  # false-positive rate ≈ 0.05
Multiple Testing
p <- c(0.01, 0.04, 0.03, 0.15, 0.08) # raw p-values
p.adjust(p, method="bonferroni") # conservative
p.adjust(p, method="BH")         # FDR
Cohen's d
cohens_d <- function(x, y) { # effect size
  sp <- sqrt(((length(x)-1)*sd(x)^2 + # pooled SD
         (length(y)-1)*sd(y)^2) /
        (length(x)+length(y)-2))
  (mean(x) - mean(y)) / sp # standardized difference
}
# 0.2 small, 0.5 medium, 0.8 large
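Usage sketch, assuming the y and gruppe columns from the tests above:
cohens_d(df$y[df$gruppe == "A"], df$y[df$gruppe == "B"])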
Expected Value
ev <- p * gain + (1-p) * loss # expected value (loss is negative)
breakeven <- -loss / (gain - loss) # break-even probability
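A worked example with illustrative numbers (p, gain, loss are assumptions):
p <- 0.3; gain <- 200; loss <- -50
p * gain + (1-p) * loss # 0.3*200 - 0.7*50 = 25
-loss / (gain - loss)   # 50/250 = 0.2: above this p, the bet pays off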
Block 3: Complexity & Generalization
Setup
library(car)     # VIF
library(glmnet)  # lasso/ridge
library(caret)   # cross-validation
Multicollinearity / VIF
cor(df[, c("x1","x2","x3")])  # correlation matrix
mod <- lm(y ~ x1 + x2, data=df) # fit the regression
vif(mod)  # > 5 problematic, > 10 severe
Bias-Variance (Polynomials)
# Underfitting → overfitting
mod <- lm(y ~ poly(x, 3), data=train) # degree-3 polynomial
pred <- predict(mod, test) # prediction
mse <- mean((test$y - pred)^2) # test error
# Optimal degree: lowest test MSE
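A sketch that searches over degrees, using the train/test split from Block 1:
mse <- sapply(1:10, function(d) { # degrees 1..10
  m <- lm(y ~ poly(x, d), data=train)
  mean((test$y - predict(m, test))^2) # test MSE per degree
})
which.min(mse) # degree with the lowest test MSE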
Lasso Regression
X <- as.matrix(df[, c("x1","x2","x3")]) # feature matrix
y <- df$y                       # response vector
fit <- glmnet(X, y, alpha=1)    # lasso
cv  <- cv.glmnet(X, y, alpha=1) # cross-validation
coef(cv, s="lambda.min")        # coefficients at best lambda
coef(cv, s="lambda.1se")        # sparser model
Ridge Regression
fit <- glmnet(X, y, alpha=0)    # ridge
# Lasso → coefficients = 0 (selection)
# Ridge → coefficients ≈ 0 (shrinkage)
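Coefficient paths make the lasso/ridge contrast visible:
plot(fit, xvar="lambda", label=TRUE) # coefficients shrink as lambda grows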
Cross-Validation (caret)
ctrl <- trainControl(method="cv", number=10) # 10-fold CV
mod <- train(y ~ ., data=df, # train the model
           method="lm", trControl=ctrl)
mod$results$RMSE      # CV RMSE
mod$results$Rsquared  # CV R²
Logistic Regression
mod <- glm(churn ~ alter + beschwerden, # logit model
         data=df, family=binomial)
exp(coef(mod))           # odds ratios
exp(confint(mod))        # ORs with CI
prob <- predict(mod, type="response") # predicted probability
pred <- ifelse(prob > 0.5, 1, 0) # predicted class
Confusion Matrix
tab <- table(Predicted=pred, Actual=y) # confusion matrix (assumes 0/1 coding)
TP <- tab["1","1"]; TN <- tab["0","0"] # extract cells
FP <- tab["1","0"]; FN <- tab["0","1"]
acc <- (TP+TN) / (TP+TN+FP+FN) # accuracy
pre <- TP / (TP+FP)          # precision
rec <- TP / (TP+FN)          # recall
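caret (loaded above) computes these in one call; positive="1" marks the positive class:
confusionMatrix(factor(pred), factor(y), positive="1")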
Sample Size / Power
# n per group for 80% power (rule of thumb, alpha = 0.05, two-sided):
n <- ceiling(16 / d^2) # sample size per group
# d=0.2 → n=400, d=0.5 → n=64, d=0.8 → n=25
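power.t.test gives the exact value; for d = 0.5 it returns n ≈ 64 per group:
power.t.test(delta=0.5, sd=1, power=0.8) # two-sample t-test, alpha = 0.05 default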