import math
import random
import matplotlib.pyplot as plt

import math
import random
import matplotlib.pyplot as plt
%matplotlib inline

def plot_impurity_candidate(f, title="Candidate impurity function"):
    """
    Draws f(p) for p = 0.00, 0.01, ..., 1.00 as a solid blue line.

    f     : function of one argument; it is called once per grid point
    title : heading shown above the figure
    """
    fractions = []
    values = []
    for step in range(101):
        fraction = step / 100
        fractions.append(fraction)
        values.append(f(fraction))

    plt.figure(figsize=(7, 4))
    plt.plot(fractions, values, 'b-', linewidth=2)
    plt.xlabel('p (fraction of Red)')
    plt.ylabel('impurity')
    plt.title(title)
    plt.grid(True)
    plt.show()

# PLOTTING HELPER — run this cell as-is
def plot_impurity_candidate(f, title="Candidate impurity function"):
    """
    Plots a candidate impurity function over the interval [0, 1].

    f     : callable taking a single fraction p
    title : figure title
    """
    grid = [k / 100 for k in range(0, 101)]
    curve = list(map(f, grid))

    plt.figure(figsize=(7, 4))
    plt.plot(grid, curve, 'b-', linewidth=2)
    plt.title(title)
    plt.xlabel('p (fraction of Red)')
    plt.ylabel('value')
    plt.grid(True)
    plt.show()

# YOUR CODE HERE
# Step 1: Print the table
# One row per fraction p, showing p, its complement 1-p and their product.
print(f"{'p':<6} {'1-p':<6} {'p*(1-p)':<10}")
print("-" * 24)
for p in [0.0, 0.2, 0.4, 0.5, 0.6, 0.8, 1.0]:
    q = 1 - p
    # Two decimals hide floating-point noise such as 0.16000000000000003.
    print(f"{p:<6.1f} {q:<6.1f} {p * q:<10.2f}")

# Step 2: Plot it
plot_impurity_candidate(lambda p: p * (1 - p), title="f(p) = p * (1-p)")

# YOUR CODE HERE
def impurity(p, q):
    """
    Returns the impurity of a set with fraction p of Red and q of Black.
    impurity(0.5, 0.5) should return 1  (most mixed = least certain)
    impurity(1, 0)     should return 0  (all same  = most certain)
    impurity(0, 1)     should return 0

    The product p * q is 0 when either class is absent and peaks at 0.25 for
    a 50/50 mix, so it is multiplied by 4 to put the maximum at exactly 1.
    """
    return 4 * p * q


# Verify the spec:
# The expected values below restate the examples given in the impurity() docstring.
print(impurity(0.5, 0.5))  # should be 1
print(impurity(0, 1))      # should be 0
print(impurity(1, 0))      # should be 0
print(impurity(0.8, 0.2))  # should be between 0 and 1

# Verify ordering:
# The closer the two fractions are to 0.5/0.5, the larger the impurity should be.
print(impurity(0.8, 0.2) < impurity(0.7, 0.3))  # should be True
print(impurity(0.7, 0.3) < impurity(0.6, 0.4))  # should be True

# Plot your impurity function
# The plotting helper passes a single p, so q is supplied as 1 - p.
plot_impurity_candidate(lambda p: impurity(p, 1 - p), title="My impurity function")

# YOUR CODE HERE
# Each entry is (bag name, p, q); the loop below unpacks them in that order.
bags = [
    ("A", 0.9, 0.1),
    ("B", 0.7, 0.3),
    ("C", 0.6, 0.4),
    ("D", 0.5, 0.5),
    ("E", 0.3, 0.7),
    ("F", 1.0, 0.0),
]

# Print one table row per bag with its impurity shown to four decimals.
print(f"{'Bag':<5} {'p':<6} {'q':<6} {'impurity':>10}")
print("-" * 30)
for name, p, q in bags:
    imp = impurity(p, q)
    print(f"{name:<5} {p:<6.1f} {q:<6.1f} {imp:>10.4f}")

# Example list; as a bare expression statement it is evaluated and discarded.
["R", "B", "B", "R", "R"]

# YOUR CODE HERE
def list_impurity(items):
    """
    Computes the impurity of a list of class labels such as "R" and "B".

    The score is 2 * (1 - sum(f ** 2)), where f runs over the fraction of the
    list held by each distinct label. With two labels this equals 4 * p * q,
    so the result is 0 for a pure list and 1 for a 50/50 mix. Labels are not
    restricted to "R"/"B": any hashable values work (for example "M"/"F").
    With more than two distinct labels the score can exceed 1.

    Returns 0.0 for an empty list, since there is nothing to be mixed.
    """
    from collections import Counter

    n = len(items)
    if n == 0:
        return 0.0
    sum_of_squares = sum((count / n) ** 2 for count in Counter(items).values())
    return 2 * (1 - sum_of_squares)


# Test cases
# Inputs cover a mixed list, two pure lists, an even two-element mix and the empty list.
test_cases = [
    ["R", "B", "B", "R", "R"],
    ["R", "R", "R", "R"],
    ["B", "B", "B", "B"],
    ["R", "B"],
    [],
]

# The :.4f format requires list_impurity to return a number for every case, including [].
for lst in test_cases:
    print(f"{str(lst):<40}  impurity = {list_impurity(lst):.4f}")

# Ranking exercise
# Three ten-element lists with 8/2, 5/5 and 7/3 Red/Black counts.
list_A = ["R", "R", "R", "R", "R", "R", "R", "R", "B", "B"]
list_B = ["R", "R", "R", "R", "R", "B", "B", "B", "B", "B"]
list_C = ["R", "R", "R", "R", "R", "R", "R", "B", "B", "B"]

for name, lst in [("A", list_A), ("B", list_B), ("C", list_C)]:
    print(f"List {name}: impurity = {list_impurity(lst):.4f}")

# Which is most mixed? Least mixed?

# Example calls, kept as comments because total_impurity is only defined
# further down; the same three cases are run right after its definition.
#   total_impurity(["R", "R", "R"], ["B", "B", "B"])  # perfectly separated
#   total_impurity(["R", "B", "R"], ["B", "R", "B"])  # each group is mixed
#   total_impurity(["R", "R", "R", "R", "R", "R", "R", "R", "R", "B"], ["B"])  # unequal sizes

# YOUR CODE HERE
def total_impurity(left, right):
    """
    Returns the weighted total impurity of two groups.
    Each group's impurity is weighted by its size relative to the combined total:

        (len(left) * list_impurity(left) + len(right) * list_impurity(right))
        / (len(left) + len(right))

    Returns 0.0 when both groups are empty, which avoids a division by zero.
    """
    n_left, n_right = len(left), len(right)
    total = n_left + n_right
    if total == 0:
        return 0.0
    weighted = n_left * list_impurity(left) + n_right * list_impurity(right)
    return weighted / total


# Test cases
# Three scenarios: two pure groups, two mixed groups, and groups of sizes 10 and 1.
print(total_impurity(["R", "R", "R"], ["B", "B", "B"]))  # perfectly separated → expect high certainty
print(total_impurity(["R", "B", "R"], ["B", "R", "B"]))  # both mixed
print(total_impurity(["R"]*9 + ["B"], ["B"]))            # unequal sizes

items = ["R", "B", "B", "R", "R"]

# YOUR CODE HERE
# Same list assigned again (the assignment above is repeated verbatim).
items = ["R", "B", "B", "R", "R"]

print(f"{'k':<5} {'left':<25} {'right':<25} {'total impurity':>15}")
print("-" * 70)

# k runs from 1 to len(items) - 1, so both slices always hold at least one element.
for k in range(1, len(items)):
    left = items[:k]
    right = items[k:]
    imp = total_impurity(left, right)
    print(f"{k:<5} {str(left):<25} {str(right):<25} {imp:>15.4f}")

# Example lists; as bare expression statements they are evaluated and discarded.
# The same four lists appear in test_lists further down.
["R", "B", "B", "R", "R"]
["R", "R", "R", "B", "B", "B"]
["B", "R", "B", "R", "B", "R"]
["R", "R", "R", "R", "B"]

# YOUR CODE HERE
def find_best_split_position(items):
    """
    Finds the split position k that minimises total_impurity(items[:k], items[k:]).
    Returns (best_k, min_impurity).

    Only positions 1 .. len(items) - 1 are tried, so both sides are non-empty.
    On a tie the smallest k wins. Raises ValueError when items has fewer than
    two elements, because no such position exists.
    """
    if len(items) < 2:
        raise ValueError("need at least two items to choose a split position")

    best_k = None
    min_impurity = float("inf")
    for k in range(1, len(items)):
        score = total_impurity(items[:k], items[k:])
        # Strict comparison keeps the first (smallest) k among equal scores.
        if score < min_impurity:
            best_k, min_impurity = k, score
    return best_k, min_impurity


# Test
# Lists used to exercise find_best_split_position; test_lists is reused by the plotting loop later on.
test_lists = [
    ["R", "B", "B", "R", "R"],
    ["R", "R", "R", "B", "B", "B"],
    ["B", "R", "B", "R", "B", "R"],
    ["R", "R", "R", "R", "B"],
]

# For each list, show the chosen position, the two resulting halves and the score.
for lst in test_lists:
    best_k, min_imp = find_best_split_position(lst)
    print(f"List: {lst}")
    print(f"  Best k={best_k}: left={lst[:best_k]}, right={lst[best_k:]}")
    print(f"  Min impurity: {min_imp:.4f}")
    print()

def plot_split_scores(items, title=None):
    """
    Draws total_impurity against every split position k = 1 .. len(items) - 1
    and marks the first position with the lowest score using a dashed red line.

    items : list of labels to split
    title : optional figure title; a default mentioning the list is used otherwise
    """
    positions = list(range(1, len(items)))
    impurities = []
    for position in positions:
        impurities.append(total_impurity(items[:position], items[position:]))
    lowest = min(impurities)
    best_k = positions[impurities.index(lowest)]

    plt.figure(figsize=(8, 4))
    plt.plot(positions, impurities, 'b-o')
    plt.axvline(x=best_k, color='red', linestyle='--', label=f'best k={best_k}')
    plt.xlabel('Split position k')
    plt.ylabel('Total impurity')
    plt.title(title or f'Split scores for {items}')
    plt.legend()
    plt.grid(True)
    plt.show()

# PLOTTING HELPER — run this cell as-is
def plot_split_scores(items, title=None):
    """
    Plots the total impurity of each split position of a list.

    The x axis holds k = 1 .. len(items) - 1; the y axis holds
    total_impurity(items[:k], items[k:]). The first k with the minimum
    score is highlighted with a vertical dashed line.
    """
    ks = [k for k in range(1, len(items))]
    scores = [total_impurity(items[:cut], items[cut:]) for cut in ks]
    # min() returns the first pair with the lowest score, i.e. the smallest k.
    best_k, _ = min(zip(ks, scores), key=lambda pair: pair[1])

    heading = title or f'Split scores for {items}'
    marker_label = f'best k={best_k}'

    plt.figure(figsize=(8, 4))
    plt.plot(ks, scores, 'b-o')
    plt.axvline(x=best_k, color='red', linestyle='--', label=marker_label)
    plt.xlabel('Split position k')
    plt.ylabel('Total impurity')
    plt.title(heading)
    plt.legend()
    plt.grid(True)
    plt.show()

# YOUR CODE HERE — plot all four lists
# One figure per list; plot_split_scores calls plt.show() for each of them.
for lst in test_lists:
    plot_split_scores(lst)

# DATASET — run this cell as-is
# Seeds the random module; nothing in this block draws random numbers itself.
random.seed(42)

# Parallel lists: genders[i] belongs to heights[i]. Heights are listed in increasing order.
heights = [158, 162, 165, 167, 168, 170, 171, 172, 174, 175,
           176, 178, 179, 180, 181, 182, 183, 185, 187, 190]
genders = ["F",  "F",  "F",  "F",  "M",  "F",  "M",  "M",  "F",  "M",
           "M",  "M",  "M",  "M",  "F",  "M",  "M",  "M",  "M",  "M"]

# Print the dataset as a two-column table.
print("Height  Gender")
print("-" * 16)
for h, g in zip(heights, genders):
    print(f"{h:<8} {g}")

# PLOTTING HELPER — run this cell as-is
def plot_height_gender(heights, genders, split_height=None, title="Height vs Gender"):
    """
    Draws one labelled dot per person: height on the x axis, gender on the y axis.

    heights      : sequence of heights
    genders      : sequence of "M"/"F" labels, parallel to heights
    split_height : when given, a dashed green vertical line is drawn there
    title        : figure title
    """
    # label -> (dot colour, row on the y axis)
    appearance = {"M": ("blue", 1), "F": ("red", 0)}

    plt.figure(figsize=(10, 3))
    for height, gender in zip(heights, genders):
        colour, row = appearance[gender]
        plt.scatter(height, row, color=colour, s=80, zorder=5)
        # Write the label slightly above its dot.
        plt.text(height, row + 0.05, gender, ha='center', fontsize=8)

    show_split = split_height is not None
    if show_split:
        plt.axvline(x=split_height, color='green', linestyle='--', linewidth=2,
                    label=f'split at {split_height}')
        plt.legend()

    plt.yticks([0, 1], ["F", "M"])
    plt.xlabel("Height (cm)")
    plt.title(title)
    plt.grid(axis='x')
    plt.tight_layout()
    plt.show()

# Plot the data
# No split_height is passed, so only the points are drawn.
plot_height_gender(heights, genders)

# YOUR CODE HERE
def split_by_threshold(heights, genders, t):
    """
    Splits the dataset at threshold t on height.
    Returns (left_genders, right_genders) where:
      left_genders  = genders where height <= t
      right_genders = genders where height > t

    The original order is kept within each group. Raises ValueError when
    heights and genders have different lengths, because the pairing would
    otherwise be silently truncated.
    """
    if len(heights) != len(genders):
        raise ValueError("heights and genders must have the same length")

    left_genders, right_genders = [], []
    for height, gender in zip(heights, genders):
        if height <= t:
            left_genders.append(gender)
        else:
            right_genders.append(gender)
    return left_genders, right_genders


# Test manually
# Single manual check at t = 172 before sweeping several thresholds.
left, right = split_by_threshold(heights, genders, 172)
print("Left (height <= 172):", left)
print("Right (height > 172):", right)
print("Total impurity:", total_impurity(left, right))

# Test multiple thresholds
# These are hand-picked values; some (160, 188) are not heights present in the dataset.
thresholds = [160, 165, 170, 172, 175, 178, 182, 185, 188]

print(f"{'Threshold':<12} {'Left size':<12} {'Right size':<12} {'Total impurity':>15}")
print("-" * 55)

# left and right are rebound on every iteration and stay set to the last split afterwards.
for t in thresholds:
    left, right = split_by_threshold(heights, genders, t)
    imp = total_impurity(left, right)
    print(f"{t:<12} {len(left):<12} {len(right):<12} {imp:>15.4f}")

# YOUR CODE HERE
def find_best_threshold(heights, genders):
    """
    Finds the height threshold that minimises total_impurity.
    Returns (best_threshold, min_impurity).

    Candidate thresholds are the distinct heights in increasing order.
    A candidate that leaves one side empty is skipped, since it does not
    split anything. On a tie the lowest threshold wins. Raises ValueError
    when no candidate yields two non-empty groups.
    """
    best_threshold = None
    min_impurity = float("inf")
    for t in sorted(set(heights)):
        left, right = split_by_threshold(heights, genders, t)
        if not left or not right:
            continue
        score = total_impurity(left, right)
        # Strict comparison keeps the first (lowest) threshold among equal scores.
        if score < min_impurity:
            best_threshold, min_impurity = t, score

    if best_threshold is None:
        raise ValueError("no threshold splits the data into two non-empty groups")
    return best_threshold, min_impurity


# Report the best threshold, then redo the split at that value to show both groups.
best_t, best_imp = find_best_threshold(heights, genders)
print(f"Best threshold: height <= {best_t}")
print(f"Minimum total impurity: {best_imp:.4f}")

left, right = split_by_threshold(heights, genders, best_t)
print(f"Left group:  {left}")
print(f"Right group: {right}")

# Same scatter plot as before, now with the split line drawn at best_t.
plot_height_gender(heights, genders, split_height=best_t)

# PLOTTING HELPER — run this cell as-is
def plot_threshold_scores(heights, genders, title="Impurity vs Threshold"):
    """
    Draws total impurity against each distinct height used as a threshold.

    Thresholds that leave either side empty are left out. The first threshold
    with the lowest impurity is marked with a dashed red vertical line.
    """
    scores = []
    for threshold in sorted(set(heights)):
        below, above = [], []
        for height, gender in zip(heights, genders):
            if height <= threshold:
                below.append(gender)
            else:
                above.append(gender)
        if below and above:
            scores.append((threshold, total_impurity(below, above)))

    ts, imps = zip(*scores)
    # min() returns the first pair with the lowest impurity.
    best_t = min(scores, key=lambda pair: pair[1])[0]

    plt.figure(figsize=(9, 4))
    plt.plot(ts, imps, 'b-o')
    plt.axvline(x=best_t, color='red', linestyle='--', label=f'best t={best_t}')
    plt.xlabel('Threshold (height cm)')
    plt.ylabel('Total impurity')
    plt.title(title)
    plt.legend()
    plt.grid(True)
    plt.show()

# YOUR CODE HERE — run the plot
# Uses the default title "Impurity vs Threshold".
plot_threshold_scores(heights, genders)

class DecisionTreeNode:
    """Interface sketch of a decision-tree node; both method bodies are placeholders."""

    def __init__(self, decision=None, boundary=None, boundary_value=None, left=None, right=None):
        # your attributes here
        pass

    def check(self, features):
        # if leaf: return decision
        # if boundary: go left or right depending on features[self.boundary]
        pass

# YOUR CODE HERE
class DecisionTreeNode:
    def __init__(self, decision=None, boundary=None, boundary_value=None, left=None, right=None):
        """
        A node in a decision tree.

        If decision is not None: this is a leaf node.
        Otherwise: this is a boundary node with boundary, boundary_value, left, right.

        decision       : value returned when a lookup ends at this leaf
        boundary       : name of the feature this node tests
        boundary_value : threshold the feature is compared against
        left           : child followed when features[boundary] <= boundary_value
        right          : child followed when features[boundary] >  boundary_value
        """
        self.decision = decision
        self.boundary = boundary
        self.boundary_value = boundary_value
        self.left = left
        self.right = right

    def check(self, features):
        """
        Traverses the tree and returns the decision stored in the leaf reached.
        features: dict of {feature_name: value}
        Rule: go LEFT if features[boundary] <= boundary_value, else go RIGHT.

        The decision is a boolean for YES()/NO() leaves, but any non-None
        value stored in a leaf is returned unchanged. Keys in features that
        no node tests are ignored.
        """
        node = self
        # A leaf is recognised by "is not None" rather than truthiness,
        # because False is a valid decision.
        while node.decision is None:
            if features[node.boundary] <= node.boundary_value:
                node = node.left
            else:
                node = node.right
        return node.decision


def YES():
    """Returns a leaf node that decides True (accept)."""
    return DecisionTreeNode(decision=True)


def NO():
    """Returns a leaf node that decides False (reject)."""
    return DecisionTreeNode(decision=False)

# Build the job offer tree
# Tree:
#   salary <= 1000 → NO
#   salary > 1000:
#       distance <= 40 → YES
#       distance > 40  → NO

# Root tests salary; its right child tests distance. Leaves come from YES()/NO().
job_tree = DecisionTreeNode(
    boundary="salary",
    boundary_value=1000,
    left=NO(),
    right=DecisionTreeNode(
        boundary="distance",
        boundary_value=40,
        left=YES(),
        right=NO()
    )
)

# Test the four cases from Exercise 5.1
# The last case has salary exactly 1000 and is expected to be rejected.
test_jobs = [
    {"salary": 800,  "distance": 20, "expected": False},
    {"salary": 1200, "distance": 30, "expected": True},
    {"salary": 1500, "distance": 60, "expected": False},
    {"salary": 1000, "distance": 15, "expected": False},
]

print(f"{'Salary':<10} {'Distance':<12} {'Result':<10} {'Expected':<10} {'Match'}")
print("-" * 55)
# Each job dict is passed to check() as-is, including its "expected" key.
for job in test_jobs:
    result = job_tree.check(job)
    match = "✓" if result == job["expected"] else "✗"
    print(f"{job['salary']:<10} {job['distance']:<12} {str(result):<10} {str(job['expected']):<10} {match}")

# YOUR CODE HERE — build the 3-level tree
# Three-level tree, matching the expected results in test_v2 below:
#   salary <= 1000 → NO
#   salary > 1000:
#       distance > 40 → NO
#       distance <= 40:
#           coffee <= 0 → NO
#           coffee > 0  → YES
job_tree_v2 = DecisionTreeNode(
    boundary="salary",
    boundary_value=1000,
    left=NO(),
    right=DecisionTreeNode(
        boundary="distance",
        boundary_value=40,
        left=DecisionTreeNode(
            boundary="coffee",
            boundary_value=0,
            left=NO(),
            right=YES(),
        ),
        right=NO(),
    ),
)

# Test cases
test_v2 = [
    {"salary": 1200, "distance": 30, "coffee": 1, "expected": True},
    {"salary": 1200, "distance": 30, "coffee": 0, "expected": False},
    {"salary": 1200, "distance": 60, "coffee": 1, "expected": False},
    {"salary": 900,  "distance": 10, "coffee": 1, "expected": False},
]

for job in test_v2:
    result = job_tree_v2.check(job)
    match = "✓" if result == job["expected"] else "✗"
    print(f"salary={job['salary']}, distance={job['distance']}, coffee={job['coffee']}  →  {result}  {match}")

# EXTENDED DATASET — run this cell as-is
# One dict per person with keys "height", "weight" and "gender".
# The height and gender columns repeat the heights/genders lists defined earlier.
data = [
    {"height": 158, "weight": 52, "gender": "F"},
    {"height": 162, "weight": 55, "gender": "F"},
    {"height": 165, "weight": 58, "gender": "F"},
    {"height": 167, "weight": 61, "gender": "F"},
    {"height": 168, "weight": 70, "gender": "M"},
    {"height": 170, "weight": 60, "gender": "F"},
    {"height": 171, "weight": 73, "gender": "M"},
    {"height": 172, "weight": 75, "gender": "M"},
    {"height": 174, "weight": 63, "gender": "F"},
    {"height": 175, "weight": 78, "gender": "M"},
    {"height": 176, "weight": 80, "gender": "M"},
    {"height": 178, "weight": 82, "gender": "M"},
    {"height": 179, "weight": 84, "gender": "M"},
    {"height": 180, "weight": 85, "gender": "M"},
    {"height": 181, "weight": 66, "gender": "F"},
    {"height": 182, "weight": 88, "gender": "M"},
    {"height": 183, "weight": 90, "gender": "M"},
    {"height": 185, "weight": 92, "gender": "M"},
    {"height": 187, "weight": 95, "gender": "M"},
    {"height": 190, "weight": 98, "gender": "M"},
]

# Print the dataset as a three-column table.
print(f"{'Height':<8} {'Weight':<8} {'Gender'}")
print("-" * 24)
for row in data:
    print(f"{row['height']:<8} {row['weight']:<8} {row['gender']}")

# YOUR CODE HERE
def find_decision_boundary(data, features, label):
    """
    Finds the best (feature, threshold) split across all given features.
    Returns (best_feature, best_threshold, min_impurity).

    data     : list of dicts (one per row)
    features : names of the columns that may be split on
    label    : name of the column holding the class label

    For every feature, each distinct value in data is tried as a threshold:
    rows with value <= threshold go left, the rest go right. A threshold that
    leaves one side empty is skipped. On a tie the earliest feature in
    `features`, and within it the lowest threshold, wins.

    When no candidate separates the rows (empty data, or every row has the
    same value for every feature) the result is (None, None, float("inf")),
    so a caller can detect "no split possible" without catching an exception.
    """
    best_feature = None
    best_threshold = None
    min_impurity = float("inf")

    for feature in features:
        for threshold in sorted({row[feature] for row in data}):
            left = [row[label] for row in data if row[feature] <= threshold]
            right = [row[label] for row in data if row[feature] > threshold]
            if not left or not right:
                continue  # e.g. the largest value puts every row on the left
            score = total_impurity(left, right)
            # Strict comparison keeps the first candidate among equal scores.
            if score < min_impurity:
                best_feature, best_threshold, min_impurity = feature, threshold, score

    return best_feature, best_threshold, min_impurity


# Search both features at once and report the winning (feature, threshold) pair.
best_feat, best_thresh, best_imp = find_decision_boundary(data, ["height", "weight"], "gender")
print(f"Best split: {best_feat} <= {best_thresh}")
print(f"Minimum total impurity: {best_imp:.4f}")

# What are the two groups?
# Same rule as the split itself: <= goes left, > goes right.
left_group = [row["gender"] for row in data if row[best_feat] <= best_thresh]
right_group = [row["gender"] for row in data if row[best_feat] > best_thresh]
print(f"Left  ({best_feat} <= {best_thresh}): {left_group}")
print(f"Right ({best_feat} >  {best_thresh}): {right_group}")

# PLOTTING HELPER — run this cell as-is
def plot_feature_split(data, feature, label, split_value=None, title=None):
    """
    Draws one labelled dot per row: the chosen feature on the x axis and the
    row's label ("M" or "F") on the y axis.

    data        : list of dicts
    feature     : key of the column plotted on the x axis
    label       : key of the column holding "M"/"F"
    split_value : when given, a dashed green vertical line is drawn there
    title       : optional figure title; defaults to "<feature> vs <label>"
    """
    # label value -> (row on the y axis, dot colour)
    appearance = {"M": (1, "blue"), "F": (0, "red")}

    plt.figure(figsize=(10, 3))
    for record in data:
        value = record[feature]
        tag = record[label]
        level, colour = appearance[tag]
        plt.scatter(value, level, color=colour, s=80, zorder=5)
        # Write the label slightly above its dot.
        plt.text(value, level + 0.06, tag, ha='center', fontsize=8)

    if split_value is not None:
        plt.axvline(x=split_value, color='green', linestyle='--', linewidth=2,
                    label=f'split at {split_value}')
        plt.legend()

    heading = title or f"{feature} vs {label}"
    plt.yticks([0, 1], ["F", "M"])
    plt.xlabel(feature)
    plt.title(heading)
    plt.grid(axis='x')
    plt.tight_layout()
    plt.show()

# YOUR CODE HERE — plot height split and weight split
# Use find_decision_boundary first on each feature separately

# Passing a single-element feature list restricts the search to that feature.
# best_h / best_w receive the feature name; only the threshold and impurity are used below.
best_h, best_ht, imp_h = find_decision_boundary(data, ["height"], "gender")
best_w, best_wt, imp_w = find_decision_boundary(data, ["weight"], "gender")

print(f"Best height split: height <= {best_ht}, impurity = {imp_h:.4f}")
print(f"Best weight split: weight <= {best_wt}, impurity = {imp_w:.4f}")

# One figure per feature, each with its own best threshold marked.
plot_feature_split(data, "height", "gender", split_value=best_ht, title=f"Height split at {best_ht} (impurity={imp_h:.3f})")
plot_feature_split(data, "weight", "gender", split_value=best_wt, title=f"Weight split at {best_wt} (impurity={imp_w:.3f})")

# Re-assigns heights and genders with the same values as the earlier dataset cell.
heights = [158, 162, 165, 167, 168, 170, 171, 172, 174, 175,
           176, 178, 179, 180, 181, 182, 183, 185, 187, 190]
genders = ["F",  "F",  "F",  "F",  "M",  "F",  "M",  "M",  "F",  "M",
           "M",  "M",  "M",  "M",  "F",  "M",  "M",  "M",  "M",  "M"]

# Example calls, kept as comments because majority_label and is_pure are only
# defined further down; the same cases are run right after their definitions.
#   majority_label(["M", "M", "F", "M"])  # → "M"
#   majority_label(["F", "M"])            # → tie: your choice
#   is_pure(["M", "M", "M"])             # → True
#   is_pure(["M", "M", "F"])             # → False

# YOUR CODE HERE
def majority_label(labels):
    """
    Returns the most common label in the list.

    Tie-breaking rule: among labels with the same highest count, the one that
    appears first in the list wins (["F", "M"] -> "F").

    Raises ValueError for an empty list, which has no majority.
    """
    from collections import Counter

    if not labels:
        raise ValueError("majority_label() needs at least one label")

    counts = Counter(labels)
    # Counter keeps first-seen order and max() returns the first maximum,
    # which together give the tie-breaking rule documented above.
    return max(counts, key=counts.get)


def is_pure(labels):
    """
    Returns True if all labels are the same.

    An empty list counts as pure (True): it contains no two labels that
    differ, so there is nothing left to separate.
    """
    if not labels:
        return True
    first = labels[0]
    return all(item == first for item in labels)


# Tests
# Quick checks of both helpers, including a tie and the empty-list edge case.
print(majority_label(["M", "M", "F", "M"]))  # M
print(majority_label(["F", "M"]))            # tie — what do you return?
print(is_pure(["M", "M", "M"]))             # True
print(is_pure(["M", "M", "F"]))             # False
print(is_pure([]))                          # edge case — what do you return?

# YOUR CODE HERE
def build_tree(data, features, label):
    """
    Recursively builds a decision tree.

    data     : list of dicts
    features : list of feature names to split on
    label    : the target column name

    Returns a DecisionTreeNode (either leaf or boundary).

    Stopping rules:
      * all rows share one label, or only one row is left -> leaf with that label
      * no split separates the rows (for example identical feature values
        with different labels) -> leaf with the majority label

    Raises ValueError for empty data: there is no label a leaf could hold.
    """
    # Base case 1: empty data
    if len(data) == 0:
        raise ValueError("cannot build a decision tree from empty data")

    labels = [row[label] for row in data]

    # Base case 2: pure or single element
    if is_pure(labels) or len(data) == 1:
        return DecisionTreeNode(decision=labels[0])

    # Recursive case
    best_feat, best_thresh, _ = find_decision_boundary(data, features, label)

    # No usable split was reported: fall back to the most common label.
    if best_feat is None:
        return DecisionTreeNode(decision=majority_label(labels))

    left_data = [row for row in data if row[best_feat] <= best_thresh]
    right_data = [row for row in data if row[best_feat] > best_thresh]

    # Safety: if the split doesn't separate anything, stop. Recursing here
    # would call build_tree again with the same rows and never terminate.
    if len(left_data) == 0 or len(right_data) == 0:
        return DecisionTreeNode(decision=majority_label(labels))

    # Both sides are strictly smaller than data, so the recursion terminates.
    left_tree = build_tree(left_data, features, label)
    right_tree = build_tree(right_data, features, label)

    return DecisionTreeNode(
        boundary=best_feat,
        boundary_value=best_thresh,
        left=left_tree,
        right=right_tree
    )

# Helper to print the tree
def print_tree(node, depth=0, label="ROOT"):
    """
    Prints the tree rooted at node, one line per node, indented two spaces
    per level.

    node  : object with a `decision` attribute; boundary nodes also need
            `boundary`, `boundary_value`, `left` and `right`
    depth : current nesting level (controls the indentation)
    label : tag printed in square brackets in front of the node
    """
    pad = "  " * depth

    # A node counts as a leaf whenever it carries a decision, even False.
    if node.decision is not None:
        print(f"{pad}[{label}] LEAF → {node.decision}")
        return

    print(f"{pad}[{label}] {node.boundary} <= {node.boundary_value}?")
    print_tree(node.left, depth + 1, label="YES (left)")
    print_tree(node.right, depth + 1, label="NO (right)")

# Build and print the tree
# `tree` is reused by the accuracy check that follows.
tree = build_tree(data, features=["height", "weight"], label="gender")
print_tree(tree)

# YOUR CODE HERE
# Compare the tree's prediction with the true gender for every row of data
# (the same rows the tree was built from) and count the matches.
correct = 0
print(f"{'Height':<8} {'Weight':<8} {'True':<8} {'Predicted':<12} {'Match'}")
print("-" * 50)

for row in data:
    # The full row, including its "gender" key, is passed to check().
    predicted = tree.check(row)
    true_label = row["gender"]
    match = "✓" if predicted == true_label else "✗"
    if predicted == true_label:
        correct += 1
    print(f"{row['height']:<8} {row['weight']:<8} {true_label:<8} {str(predicted):<12} {match}")

print(f"\nAccuracy: {correct}/{len(data)} = {correct/len(data)*100:.1f}%")

class SimpleDecisionTree:
    # Interface sketch only: both methods are placeholders that do nothing yet.
    # A second class with the same name is defined later in this file.
    def fit(self, X, y):
        """
        X : list of dicts (each dict is one row of features)
        y : list of labels (strings or booleans)
        
        Builds the decision tree and stores it in self.root.
        Also stores the feature names in self.features.
        """
        pass

    def predict(self, X):
        """
        X : list of dicts
        Returns a list of predicted labels, one per row.
        """
        pass

# YOUR CODE HERE
class SimpleDecisionTree:
    """Decision-tree classifier with a fit/predict interface built on build_tree."""

    # Key under which the label is stored while the tree is being built.
    # Feature dicts passed to fit() must not already contain this key.
    _LABEL_KEY = "__label__"

    def __init__(self):
        self.root = None      # DecisionTreeNode once fit() has run
        self.features = None  # list of feature names once fit() has run

    def fit(self, X, y):
        """
        X : list of dicts (features)
        y : list of labels

        Builds the tree, stores it in self.root and stores the feature names
        (the keys of the first row) in self.features. Returns self so that
        calls can be chained.

        Raises ValueError when X is empty or when X and y differ in length.
        """
        if len(X) != len(y):
            raise ValueError("X and y must have the same length")
        if len(X) == 0:
            raise ValueError("cannot fit a decision tree on empty data")

        self.features = list(X[0].keys())
        # build_tree expects the label inside each row, so merge X and y
        # into new dicts; the caller's rows are left untouched.
        rows = [{**features, self._LABEL_KEY: target} for features, target in zip(X, y)]
        self.root = build_tree(rows, self.features, self._LABEL_KEY)
        return self

    def predict(self, X):
        """
        X : list of dicts
        Returns a list of predicted labels.

        Raises RuntimeError when called before fit().
        """
        if self.root is None:
            raise RuntimeError("call fit() before predict()")
        return [self.root.check(row) for row in X]

# Test the full pipeline
# Separate the feature columns (X) from the label column (y).
X = [{"height": row["height"], "weight": row["weight"]} for row in data]
y = [row["gender"] for row in data]

clf = SimpleDecisionTree()
clf.fit(X, y)
predictions = clf.predict(X)

# Accuracy
# Measured on the same rows that were passed to fit().
correct = sum(p == t for p, t in zip(predictions, y))
print(f"Training accuracy: {correct}/{len(y)} = {correct/len(y)*100:.1f}%")

# Predict on new data
# None of these three height/weight pairs appears in data.
new_people = [
    {"height": 163, "weight": 54},
    {"height": 185, "weight": 91},
    {"height": 172, "weight": 65},
]
new_preds = clf.predict(new_people)
print("\nNew predictions:")
for person, pred in zip(new_people, new_preds):
    print(f"  height={person['height']}, weight={person['weight']}  →  {pred}")

# YOUR CODE HERE
print_tree(clf.root)

# Challenge A — YOUR CODE
# Modify build_tree and SimpleDecisionTree to support max_depth

# Challenge B — YOUR CODE

# Load the iris dataset
# Requires scikit-learn, which is imported here rather than at the top of the file.
from sklearn.datasets import load_iris
iris = load_iris()

# Convert to the format your tree expects
feature_names = iris.feature_names  # 4 features
X_iris = [dict(zip(feature_names, row)) for row in iris.data]
y_iris = [iris.target_names[t] for t in iris.target]  # "setosa", "versicolor", "virginica"

# Challenge C — YOUR CODE
# The import, load and conversion below repeat the statements above.
from sklearn.datasets import load_iris

iris = load_iris()
feature_names = iris.feature_names
X_iris = [dict(zip(feature_names, row)) for row in iris.data]
y_iris = [iris.target_names[t] for t in iris.target]

clf_iris = SimpleDecisionTree()
clf_iris.fit(X_iris, y_iris)

# Accuracy is measured on the same rows that were passed to fit().
preds = clf_iris.predict(X_iris)
acc = sum(p == t for p, t in zip(preds, y_iris)) / len(y_iris)
print(f"Iris training accuracy: {acc*100:.1f}%")

print("\nTree structure:")
print_tree(clf_iris.root)

# Challenge D — YOUR CODE

def gini_impurity(p, q):
    """
    Gini impurity for two classes.
    Formula: 1 - (p^2 + q^2)
    Note: Gini=0 when pure, Gini=0.5 when maximally mixed.

    p, q : fractions of the two classes (expected to sum to 1)
    """
    return 1 - (p ** 2 + q ** 2)

def entropy_impurity(p, q):
    """
    Entropy impurity for two classes.
    Formula: -(p * log2(p) + q * log2(q))  [handle p=0 or q=0 as 0]

    p, q : fractions of the two classes (expected to sum to 1)

    Returns 0 for a pure set and 1 for a 50/50 mix.
    """
    # log2(0) is undefined, so a class with fraction 0 contributes nothing.
    # Negating each term (rather than the sum) avoids returning -0.0.
    return sum(-fraction * math.log2(fraction) for fraction in (p, q) if fraction > 0)

# Plot all three impurity measures on the same graph
# Sample p on a grid of 101 points from 0 to 1 and draw the three impurity
# measures in one figure; each function receives q as 1 - p.
ps = [step / 100 for step in range(101)]
plt.figure(figsize=(8, 4))
plt.plot(ps, [impurity(frac, 1 - frac) for frac in ps], label='Your formula')
plt.plot(ps, [gini_impurity(frac, 1 - frac) for frac in ps], label='Gini', linestyle='--')
plt.plot(ps, [entropy_impurity(frac, 1 - frac) for frac in ps], label='Entropy', linestyle=':')
plt.xlabel('p')
plt.ylabel('impurity')
plt.title('Comparing impurity measures')
plt.legend()
plt.grid(True)
plt.show()

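# | Bag | Contents        | p   | q   |
# |-----|-----------------|-----|-----|
# | A   | 9 Red, 1 Black  | 0.9 | 0.1 |
# | B   | 7 Red, 3 Black  | 0.7 | 0.3 |
# | C   | 6 Red, 4 Black  | 0.6 | 0.4 |
# | D   | 5 Red, 5 Black  | 0.5 | 0.5 |
# | E   | 3 Red, 7 Black  | 0.3 | 0.7 |
# | F   | 10 Red, 0 Black | 1.0 | 0.0 |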

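# | salary | distance | coffee | Expected |
# |--------|----------|--------|----------|
# | 1200   | 30       | 1      | YES      |
# | 1200   | 30       | 0      | NO       |
# | 1200   | 60       | 1      | NO       |
# | 900    | 10       | 1      | NO       |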

# Learning Decision Trees by Inventing Them

## What you'll need

## PART 1: What Is Impurity?

### Exercise 1.1 — Think About Mixing

### Exercise 1.2 — Invent the Formula

### Exercise 1.3 — Scale It

### Exercise 1.4 — Explore the Function

## PART 2: Impurity of a List

### Exercise 2.1 — From Fractions to Lists

### Exercise 2.2 — Weighted Impurity of Two Groups

## PART 3: Finding the Best Split

### Exercise 3.1 — Splitting a List at a Position

### Exercise 3.2 — The Best Split Function

### Exercise 3.3 — Visualise the Split Scores

## PART 4: Splitting Real Data — Height & Gender

### Exercise 4.1 — Meet the Dataset

### Exercise 4.2 — Visualise the Data

### Exercise 4.3 — Split by Height Threshold

### Exercise 4.4 — Find the Best Threshold Automatically

### Exercise 4.5 — Plot Impurity vs Threshold

## PART 5: A Decision Node — The Job Offer

### Exercise 5.1 — What Is a Decision Tree Node?

### Exercise 5.2 — Build the DecisionTreeNode Class

### Exercise 5.3 — A Bigger Tree

## PART 6: Multiple Features — Finding the Best Split

### Exercise 6.1 — Add a Second Feature: Weight

### Exercise 6.2 — Find the Best Feature and Threshold

### Exercise 6.3 — Visualise Both Features

## PART 7: Growing the Tree — Recursive Splitting

### Exercise 7.1 — What Happens After the First Split?

### Exercise 7.2 — Stopping Conditions

### Exercise 7.3 — Build the Tree Recursively

### Exercise 7.4 — Use the Tree to Predict

## PART 8: The Full Decision Tree — fit and predict

### Exercise 8.1 — The Interface

### Exercise 8.2 — Implement SimpleDecisionTree

### Exercise 8.3 — Print the Learned Tree

## PART 9: Bonus Challenges

### Challenge A — Max Depth

### Challenge B — Min Samples to Split

### Challenge C — A New Dataset

### Challenge D — What Is Gini Impurity?

## Reflection — What Did You Just Build?

### Exercise 5.2 — Build the `DecisionTreeNode` Class

## PART 8: The Full Decision Tree — `fit` and `predict`

### Exercise 8.2 — Implement `SimpleDecisionTree`