Problem set 2

For these exercises, do not load any packages other than dslabs.

Make sure to use vectorization whenever possible (avoid loops unless explicitly allowed).

# Load only the allowed package
library(dslabs)

1

What is the sum of the first 150 positive integers? Use the functions seq and sum to compute the sum with R for any n.

# Step 1: define n
n <- 150

# Step 2: create the sequence 1, 2, ..., n
# x <- 

# Step 3: compute the sum of x
# ans <- 

# Step 4: print ans
# ans

2

Load the murders dataset from dslabs. Use the function str to examine the structure of the murders object.

What are the column names used by the data frame for these five variables: state name, abbreviation, region, population, total murders?
Show the subset of murders containing the states with a murder rate below 1.2 per 100,000 people.
Show all variables.

# Examine structure
str(murders)

'data.frame':   51 obs. of  5 variables:
 $ state     : chr  "Alabama" "Alaska" "Arizona" "Arkansas" ...
 $ abb       : chr  "AL" "AK" "AZ" "AR" ...
 $ region    : Factor w/ 4 levels "Northeast","South",..: 2 4 4 2 4 4 1 2 2 2 ...
 $ population: num  4779736 710231 6392017 2915918 37253956 ...
 $ total     : num  135 19 232 93 1257 ...

# Step: print the column names
# names(murders)

# Step 1: compute murder rate per 100,000 into a vector called rate
# rate <- 

# Step 2: create a logical vector called idx for states with rate < 1.2
# idx <- 

# Step 3: subset murders using idx
# murders[idx, ]

3

Show the subset of murders containing the states that have a murder rate below 1.2 per 100,000 people and are in the Northeast of the US. Do not show the region variable.

# Step 1: compute rate (or reuse from Q2)
# rate <- 

# Step 2: create logical vectors low and ne
# low <- 
# ne <- 

# Step 3: combine conditions into keep
# keep <- 

# Step 4: subset and remove region column
# out <- 
# out_no_region <- 

# out_no_region

4

Among states with a murder rate less than 1.2 per 100,000, show the smallest population state (show the state name, population, and rate).

# Step 1: compute rate
# rate <- 

# Step 2: restrict to low-rate states
# low <- 

# Step 3: find the row index of the smallest population among those
# Hint: use which(low) and which.min(...)
# i <- 

# Step 4: report as a small data frame with columns state, population, rate
# data.frame(...)

5

Show the state with a population of more than 8 million with the lowest murder rate (show the state name, population, and rate).

# Step 1: compute rate
# rate <- 

# Step 2: create logical vector big for population > 8 million
# big <- 

# Step 3: find the row index i of the smallest rate among big-pop states
# i <- 

# Step 4: report state, population, rate
# data.frame(...)

6

Compute the murder rate for each region of the US (total murders in the region divided by the region's total population, then multiplied by 100,000). Return a data frame with one row per region and columns region and rate.

# Step 1: compute total murders by region
# murders_by_region <- tapply( , , sum)

# Step 2: compute total population by region
# pop_by_region <- tapply( , , sum)

# Step 3: compute rates
# rate_by_region <- 

# Step 4: make a data frame with region names and rates
# region_rates <- data.frame(...)
# region_rates

7

Create a vector of numbers that starts at 5, does not pass 60, and adds numbers in increments of 3/8.

How many numbers does the vector have?

# Step 1: define start, end, step
start <- 5
end <- 60
step <- 3/8

# Step 2: create the sequence v
# v <- 

# Step 3: show the first 6 values
# head(v)

# Step 4: show the length
# length(v)

8

Make this data frame:

# Temperatures in degrees Fahrenheit, listed in the same order as `city` below
temp_f <- c(72, 95, 41, 86, 78, 33)
city <- c("Chicago", "Lagos", "Oslo", "Rio de Janeiro", 
          "San Juan", "Toronto")
# One row per city: column `name` holds the city, `temperature_f` its temperature
city_temps <- data.frame(name = city, temperature_f = temp_f)
# Print the data frame to check its contents
city_temps

            name temperature_f
1        Chicago            72
2          Lagos            95
3           Oslo            41
4 Rio de Janeiro            86
5       San Juan            78
6        Toronto            33

Add a new column called temperature_c containing the temperatures in Celsius. Keep all existing columns.

# Step 1: compute Celsius using (F - 32) * 5/9
# city_temps$temperature_c <- 

# Step 2: print city_temps
# city_temps

9

Write a function euler2 that computes:

\[ S_n = 1 + \frac{1}{2^2} + \frac{1}{3^2} + \dots + \frac{1}{n^2}. \]

Test your function at n = 10 and n = 100.

# Step 1: write the function
# Partial sum of the series S_n = 1 + 1/2^2 + 1/3^2 + ... + 1/n^2.
#
# n: the number of terms to add (a single non-negative whole number).
# Returns a single numeric value, S_n. For n = 0 the sum is empty and the
# result is 0.
euler2 <- function(n) {
  # seq_len() instead of 1:n so that n = 0 gives no terms rather than c(1, 0)
  k <- seq_len(n)
  # Vectorised: square every k, take reciprocals, and add them up
  sum(1 / k^2)
}

# Step 2: test
# euler2(10)
# euler2(100)

10

Plot \(S_n\) versus \(n\) for \(n = 1,2,\dots,2000\) with a horizontal dashed line at \(\pi^2/6\).

# Step 1: create n_vals <- 1:2000
# n_vals <- 

# Step 2: compute S for each n using sapply
# S <- 

# Step 3: plot
# plot(n_vals, S, type="l", xlab="n", ylab="S_n")

# Step 4: add dashed line at pi^2/6
# abline(h = , lty = 2)

11

Use %in% and state.abb to create a logical vector that indicates which of the following are actual state abbreviations: AL, AK, AZ, AR, AA.

test <- c("AL", "AK", "AZ", "AR", "AA")

# Step 1: create is_real
# is_real <- 

# Step 2: print is_real
# is_real

12

Report the one entry that is not an actual abbreviation.

test <- c("AL", "AK", "AZ", "AR", "AA")

# Step 1: create not_real using !
# not_real <- 

# Step 2: find index with which()
# idx <- 

# Step 3: print the entry
# test[idx]

13

Using %in%, show all variables for Florida, California, and New York, in that order.

targets <- c("Florida", "California", "New York")

# Step 1: subset murders to those states
# sub <- 

# Step 2: reorder rows using match()
# sub_ordered <- 

# Step 3: print sub_ordered
# sub_ordered

14

Write a function vander_helper(x, n) that returns \((1, x, x^2, \dots, x^n)\). Show results for x=2, n=6.

Restrictions: no loop.

# Powers of x from 0 up to n: (1, x, x^2, ..., x^n).
#
# x: a single number, the base.
# n: a single non-negative whole number, the highest exponent.
# Returns a numeric vector of length n + 1.
vander_helper <- function(x, n) {
  # Guard against a negative n, for which 0:n would count downwards
  stopifnot(length(n) == 1, n >= 0)
  # `^` is vectorised over the exponent, so no loop is needed
  x^(0:n)
}

# Test:
# vander_helper(2, 6)

15

Create a vector using:

# Number of draws and the probability of drawing a 1
n <- 20000
p <- 0.35
# Note: 2025-9-18 is evaluated as arithmetic, so the seed actually used is 1998
set.seed(2025-9-18)
# n draws with replacement from {0, 1}; 0 has probability 1 - p and 1 has probability p
x <- sample(c(0,1), n, prob = c(1 - p, p), replace = TRUE)

Compute the length of each stretch of consecutive 1s (run lengths of 1s) and plot the distribution.

Do not use a loop.
Hint: use rle(x).

Then compare empirical proportions to the geometric prediction for run lengths 1 through 8.

# Step 1: compute r <- rle(x)
# r <- 

# Step 2: extract ones_lengths (lengths where values == 1)
# ones_lengths <- 

# Step 3: plot distribution (hist or barplot)
# hist(ones_lengths, breaks = 30)

# Step 4: empirical proportions for k=1:8
# tab <- table(ones_lengths)
# emp_counts <- as.numeric(tab[as.character(1:8)])
# emp_counts[is.na(emp_counts)] <- 0
# emp_probs <- emp_counts / length(ones_lengths)

# Step 5: theoretical probabilities (1-p)*p^(k-1)
# k <- 1:8
# theory_probs <- 

# Step 6: make a comparison data frame
# comparison <- data.frame(run_length = k, empirical_prob = emp_probs, theory_prob = theory_probs)
# comparison

16

In the murders dataset:

Compute the national average murder rate.
Create labels using ifelse:
- "High Crime, High Pop" if rate > national average and pop > 6 million
- "High Crime, Low Pop" if rate > national average and pop ≤ 6 million
- "Lower Crime" otherwise

Then show a table() of the labels.

# Step 1: state-level rate
# rate <- 

# Step 2: national average rate
# national_rate <- 

# Step 3: logical vectors high_crime and high_pop
# high_crime <- 
# high_pop <- 

# Step 4: labels using nested ifelse
# labels <- 

# Step 5: table(labels)
# table(labels)

17

What is the murder rate of the state that ranks 12th in terms of murder rate (from highest to lowest)?

Show your work using order (and optionally check with sort or rank).

# Step 1: rate vector
# rate <- 

# Step 2: ord <- order(rate, decreasing = TRUE)
# ord <- 

# Step 3: i <- ord[12]
# i <- 

# Step 4: report state and rate
# data.frame(state = murders$state[i], rate = rate[i])

18

Write a function compute_harmonic_mean that returns the harmonic mean of a numeric vector, but returns NA if any values are zero or negative. Test on c(1,2,4,8) and show it is about 2.133333.

# Harmonic mean of a numeric vector: length(x) / sum(1 / x).
#
# x: a numeric vector.
# Returns a single numeric value. The result is NA when the harmonic mean is
# not defined for the input: any value is zero or negative, any value is
# missing, or x is empty.
compute_harmonic_mean <- function(x) {
  # anyNA() is checked first so that `any(x <= 0)` can never be NA inside if()
  if (length(x) == 0 || anyNA(x) || any(x <= 0)) {
    return(NA_real_)
  }
  length(x) / sum(1 / x)
}

# Test:
# compute_harmonic_mean(c(1, 2, 4, 8))

19

Create a function safe_divide(x, y) that returns x/y but returns "Cannot divide by zero" when y is zero. Make it work element-wise on vectors (vectorized). Test it on:

x <- c(10, 20, 30)
y <- c(2, 0, 5)

# Element-wise division that reports division by zero instead of returning Inf.
#
# x: numeric vector of numerators.
# y: numeric vector of denominators (recycled against x in the usual way).
# Returns a character vector as long as x / y: each entry is the quotient
# converted to text, or "Cannot divide by zero" where the denominator is 0.
safe_divide <- function(x, y) {
  # The result must be character so it can hold both numbers and the message
  out <- as.character(x / y)
  # Recycle the zero test to the result length so a shorter y still lines up
  zero_div <- rep_len(y == 0, length(out))
  # which() drops NA, so a missing denominator stays NA rather than being flagged
  out[which(zero_div)] <- "Cannot divide by zero"
  out
}

# Test:
# x <- c(10, 20, 30)
# y <- c(2, 0, 5)
# safe_divide(x, y)

20

Write a function classify_state_safety(state_name) that returns:

"Very Safe" if rate < 1
"Safe" if 1 ≤ rate < 3
"Moderate" if 3 ≤ rate < 5
"High Risk" if rate ≥ 5
"State not found" if the state is not in the dataset

Test on "Vermont", "Texas", "California", "NotAState".

Then use sapply to classify all states and use table() to count how many fall into each category.

# Step 1: compute a named vector of rates
# rate <- murders$total / murders$population * 100000
# rate_named <- setNames(rate, murders$state)

# Classify one state by its murder rate per 100,000 people.
#
# state_name: a single character string, matched exactly against murders$state.
# Returns one of "Very Safe" (rate < 1), "Safe" (1 <= rate < 3),
# "Moderate" (3 <= rate < 5), "High Risk" (rate >= 5), or "State not found"
# when state_name is not in the dataset.
#
# The rate is computed from `murders` inside the function, so the function does
# not rely on a separately defined global rate vector.
classify_state_safety <- function(state_name) {
  # match() gives the row of the state, or NA when the name is not present
  i <- match(state_name, murders$state)
  if (is.na(i)) {
    return("State not found")
  }
  r <- murders$total[i] / murders$population[i] * 100000
  # Thresholds are tested in increasing order, so each branch only needs
  # the upper bound of its interval
  if (r < 1) {
    "Very Safe"
  } else if (r < 3) {
    "Safe"
  } else if (r < 5) {
    "Moderate"
  } else {
    "High Risk"
  }
}

# Tests:
# classify_state_safety("Vermont")
# classify_state_safety("Texas")
# classify_state_safety("California")
# classify_state_safety("NotAState")

# Step 5: classify all states with sapply
# cats <- sapply(murders$state, classify_state_safety)

# Step 6: count categories
# table(cats)

Convert to a PDF file

In a Linux terminal, run the following command (install any missing packages on the fly):

# Change path_to_hw2.qmd to the actual path of the file, e.g. hw/hw2.qmd if the current directory is the parent directory of hw/

quarto render path_to_hw2.qmd --to pdf