15-sentiment

Author
Affiliation

Prof Amanda Luby

Carleton College
Stat 220 - Spring 2025

library(tidyverse)
library(tidytext) # functions for doing text analysis

Load the Data

A random (?) sample of 26,882 reviews of Coursera courses (Source: Kaggle).

# Read the Coursera review sample from the course website into a tibble,
# then print it to inspect the columns (CourseId, Review, Label, cld2,
# review_id).
en_coursera_reviews <- read_csv(
  file = "https://stat220-s25.github.io/data/en_coursera_sample.csv"
)
en_coursera_reviews
# A tibble: 26,882 × 5
   CourseId                    Review                      Label cld2  review_id
   <chr>                       <chr>                       <dbl> <chr>     <dbl>
 1 nurture-market-strategies   It would be better if the …     1 en            1
 2 nand2tetris2                Superb course. Great prese…     5 en            2
 3 schedule-projects           Excellent course!               5 en            3
 4 teaching-english-capstone-2 I'd recommend this course …     5 en            4
 5 machine-learning            This course was so effecti…     5 en            5
 6 python-network-data         Words cannot describe how …     5 en            6
 7 clinical-trials             Great course!                   5 en            7
 8 python-genomics             I didn't know anything abo…     3 en            8
 9 strategic-management        Loved everything about thi…     5 en            9
10 script-writing              No significant instruction…     1 en           10
# ℹ 26,872 more rows

Load Sentiment data

“bing”

# Store the complete bing lexicon (one row per word with its sentiment label)
# so that later joins on `word` can match every lexicon entry.
bing_sentiments <- get_sentiments("bing")

# Show only a random 20-row sample for a quick look; the sample is printed,
# not assigned, so `bing_sentiments` itself stays complete.
bing_sentiments %>%
  slice_sample(n = 20)

“afinn” (each word has a numeric value: negative for negative words, positive for positive words)

# A tibble: 20 × 2
   word          value
   <chr>         <dbl>
 1 cheerful          2
 2 heroes            2
 3 debonair          2
 4 cocksucker       -5
 5 unequal          -1
 6 stunned          -2
 7 worth             2
 8 mischief         -1
 9 peacefully        2
10 exhausted        -2
11 cruelty          -3
12 roflcopter        4
13 passive          -1
14 ignored          -2
15 assassination    -3
16 rejoicing         4
17 murders          -2
18 exclude          -1
19 woohoo            3
20 approval          2

Sentiment of each review

# Net bing sentiment per review:
#   1. split each review's text into one row per word,
#   2. keep only the words that appear in `bing_sentiments`,
#   3. per review, count +1 for each "positive" word and -1 for each
#      "negative" word, and add them up.
# Reviews with no matching words drop out at the inner join.
bing_review_scores <-
  unnest_tokens(en_coursera_reviews, word, Review) %>%
  inner_join(bing_sentiments, by = "word") %>%
  group_by(review_id) %>%
  summarise(
    sum = sum((sentiment == "positive") - (sentiment == "negative"))
  )

bing_review_scores
# A tibble: 554 × 2
   review_id   sum
       <dbl> <int>
 1        31     1
 2        34     1
 3        62     1
 4        70     1
 5       109     1
 6       159     1
 7       206     1
 8       389     1
 9       422     1
10       475     1
# ℹ 544 more rows