Twitter network data mining with R

@dzidorius (Dzidas)

Fork the code from
https://github.com/kafka399/haxogreen.lu

Prerequisites for R

# Packages this analysis depends on.
requirements <- c("twitteR", "tm", "wordcloud", "RColorBrewer", "xtable")
# Install only those that are not yet present in the local library.
already_there <- requirements %in% installed.packages()[, 1]
if (!all(already_there)) {
    install.packages(requirements[!already_there])
}

Get the data

Who is tweeting about #Haxogreen?

# Count tweets per author and show the six most active accounts.
# library() (unlike require()) stops immediately if the package is missing.
library(twitteR)
# Live query; the cached copy loaded below is used instead:
# tweets=twitteR::searchTwitter('#haxogreen',n=2000)
load("tweets.Rdata")
# Screen name of the author of every tweet. vapply() guarantees a character
# vector, and the variable is not called `names` so base::names() is not masked.
screen_names <- vapply(tweets, function(x) x$screenName, character(1))
# One row per author with the number of tweets, sorted ascending by count.
rez <- aggregate(screen_names, list(factor(screen_names)), length)
rez <- rez[order(rez$x), ]
# rownames(rez)=NULL
colnames(rez) <- c("name", "count")
# only for presentation
options(xtable.type = "html")
library(xtable)
# Transposed so the six top authors appear as columns.
xtable(t(tail(rez, 6)))
23 47 3 28 7 56
name HoffmannMich psycon anachorete kwisArts bobreuter syn2cat
count 8 9 10 13 15 44

Plot the top 10 tweeters

# Bar chart of the ten highest tweet counts, labelled with the screen names
# (small, perpendicular axis labels so they stay readable).
with(tail(rez, 10), barplot(count, names.arg = as.character(name),
    cex.names = 0.7, las = 2))

Who are these people?

# Print a short Markdown profile for each of the three most active tweeters.
# Live query; the cached copy loaded below is used instead:
# users=lookupUsers(as.character(tail(rez,3)$name))
load("users.Rdata")
top_names <- as.character(tail(rez, 3)$name)
# Keep only the loaded users whose screen name is among the top three.
users <- Filter(function(u) u$screenName %in% top_names, users)

# Iterating over the list itself (rather than 1:length(users)) makes the loop
# a no-op when no user matched, instead of indexing a non-existent element.
for (u in users) {
    # Heading: display name and handle.
    cat(paste0("**", u$name, " @", u$screenName, "**", "\n===\n"))
    # Profile picture.
    cat(paste0("![](http://api.twitter.com/1/users/profile_image?screen_name=",
        u$screenName, "&size=normal)"))
    # "Spam rate" is the followers-to-friends ratio, rounded to two digits.
    cat(paste0("  \n**Created:** ", u$created, "  \n**Spam rate:** ",
        round(u$followersCount / u$friendsCount, digits = 2),
        "  \n**Activity:** ", u$statusesCount, "  \n**Location:** ",
        u$location, "  \n", u$description, "  \n", "**Last status:** ",
        u$lastStatus$text, "\n\n"))
}

Bob Reuter @bobreuter


Created: 2008-04-01 12:43:44
Spam rate: 1.03
Activity: 567
Location: Luxembourg
I’m a Senior Lecturer in Educational Technology & Psychology @ University of Luxembourg.
Last status: @rodlux there are toilets! Used them yesterday! and I can confirm the presence of showers!!! just saw somebody freshly showered! #haxogreen

David Raison @kwisArts


Created: 2009-07-15 14:30:25
Spam rate: 0.91
Activity: 7138
Location: Luxembourg
cynic, founder at @syn2cat hackerspace / political science graduate / media science student
Last status: RT @foobareV: Damit wir auf der #haxogreen die Orientierung nicht verlieren… http://t.co/UB20qhjH

syn2cat a.s.b.l @syn2cat


Created: 2009-07-11 17:40:00
Spam rate: 0.67
Activity: 5994
Location: Luxembourg
syn2cat is a hackerspace located in Strassen, Luxembourg. Tweets from this account are brought to you by @SteveClement, @kwisArts and @tdegeling.
Last status: RT @GuyFoetz: Good morning folks at #haxogreen ready to hack? =)

Exploring similarities - code

# Pairwise overlap of the friend lists of the ten most active tweeters.
# Offline version; the live call was:
# users=lookupUsers(as.character(tail(rez,10)$name))
load("users.Rdata")

# friends=sapply(seq(1:10),function(x){(users[[x]]$getFriends())})
load("friends.Rdata")

# Screen names in each user's friend list, extracted once instead of once per
# pair.
friend_names <- lapply(1:10, function(k) {
    vapply(friends[[k]], function(f) f$screenName, character(1))
})

# Every unordered pair of the ten users, one pair per column.
user_pairs <- combn(1:10, 2)

# rez: per pair, the share of the second user's friends that also appear in
# the first user's friend list.
rez <- apply(user_pairs, 2, function(p) {
    sum(friend_names[[p[2]]] %in% friend_names[[p[1]]]) / length(friends[[p[2]]])
})

# rez3: per pair, the share of the first user's friends that also appear in
# the second user's friend list.
rez3 <- apply(user_pairs, 2, function(p) {
    sum(friend_names[[p[1]]] %in% friend_names[[p[2]]]) / length(friends[[p[1]]])
})

# rez2=combn(1:10,2)[,which(rez>.2)]

# Build a data frame of "name ~ name" labels and rates for the selected pairs.
# Pair columns are addressed one index at a time, so this also works when only
# a single pair (or none) is selected; subsetting the pair matrix and passing
# the result to apply() fails in that case because the matrix is dropped to a
# plain vector.
pair_table <- function(rates, keep) {
    hits <- which(keep)
    data.frame(pairs = vapply(hits, function(k) {
        paste(users[[user_pairs[1, k]]]$name, "~", users[[user_pairs[2, k]]]$name)
    }, character(1)), rate = rates[hits])
}

# Pairs where more than 40% of one user's friends are shared, in either
# direction.
pairs <- rbind(pair_table(rez, rez > 0.4), pair_table(rez3, rez3 > 0.4))

Exploring similarities

# Render the most similar pairs, sorted by ascending overlap rate.
by_rate <- order(pairs$rate)
xtable(pairs[by_rate, ])
pairs rate
1 Michel Hoffmann ~ Pit Wenkin 0.44
4 Pit Wenkin ~ David Raison 0.45
3 Michel Hoffmann ~ syn2cat a.s.b.l 0.49
5 Pit Wenkin ~ syn2cat a.s.b.l 0.54
2 Michel Hoffmann ~ Thierry Degeling 0.58
8 David Raison ~ syn2cat a.s.b.l 0.60
6 Thierry Degeling ~ David Raison 0.67
7 Thierry Degeling ~ syn2cat a.s.b.l 0.73

Exploring dissimilarities - code

# Every unordered pair of the ten users, one pair per column (same column
# order as the overlap rates in rez and rez3).
user_pairs <- combn(1:10, 2)

# Build a data frame of "name ~ name" labels and rates for the selected pairs.
# Pair columns are addressed one index at a time, so this also works when only
# a single pair (or none) is selected; subsetting the pair matrix and passing
# the result to apply() fails in that case because the matrix is dropped to a
# plain vector.
pair_table <- function(rates, keep) {
    hits <- which(keep)
    data.frame(pairs = vapply(hits, function(k) {
        paste(users[[user_pairs[1, k]]]$name, "~", users[[user_pairs[2, k]]]$name)
    }, character(1)), rate = rates[hits])
}

# Pairs where fewer than 0.7% of one user's friends are shared, in either
# direction.
pairs <- rbind(pair_table(rez, rez < 0.007), pair_table(rez3, rez3 < 0.007))

Exploring dissimilarities

# Render the least similar pairs, sorted by ascending overlap rate.
by_rate <- order(pairs$rate)
xtable(pairs[by_rate, ])
pairs rate
11 anachorete ~ Dzidas 0.00
2 Tijmen Leroi ~ anachorete 0.00
7 Thierry Degeling ~ anachorete 0.00
3 Tijmen Leroi ~ Bob Reuter 0.01
6 psy ~ Bob Reuter 0.01
1 Tijmen Leroi ~ psy 0.01
4 Pit Wenkin ~ psy 0.01
9 psy ~ Dzidas 0.01
10 psy ~ Bob Reuter 0.01
5 psy ~ anachorete 0.01
8 Michel Hoffmann ~ Tijmen Leroi 0.01

Word cloud - code

# Build the term frequencies for the word cloud from the tweet texts.
# Live query; the cached copy loaded below is used instead:
# tweets=twitteR::searchTwitter('#haxogreen',n=2000)
load("tweets.Rdata")
tweets <- vapply(tweets, function(x) x$text, character(1))
# Drop every tweet whose text repeats the tweet directly before it. The first
# tweet has no predecessor and is always kept. (The earlier index arithmetic,
# seq(2:length(tweets)), produced 1..n-1 and therefore always discarded the
# first and the last tweet.)
if (length(tweets) > 1) {
    tweets <- tweets[c(TRUE, tweets[-1] != tweets[-length(tweets)])]
}
library(tm)
corpus <- Corpus(VectorSource(tweets), readerControl = list(language = "en"))
# tolower() is a plain base function; content_transformer() wraps it so that
# the corpus elements stay text documents (required since tm 0.6).
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeWords, c(stopwords("english"), "rt"))

corpus.matrix <- TermDocumentMatrix(corpus)
# NOTE(review): the pruned matrix is stored in `corpus` but not used below;
# the counts are taken from the full corpus.matrix. Confirm this is intended.
corpus <- removeSparseTerms(corpus.matrix, 0.75)

# Total occurrences of every term across all tweets, ascending.
rez <- sort(rowSums(as.matrix(corpus.matrix)))
rez <- data.frame(name = names(rez), count = as.numeric(rez))
# Exclude the hashtag itself. A logical filter keeps all rows when the term is
# absent, whereas rez[-which(...), ] would return an empty table.
rez <- rez[rez$name != "haxogreen", ]

Word cloud

# Draw the word cloud of terms that occur at least seven times.
# library() (unlike require()) stops immediately if a package is missing,
# instead of failing later at the wordcloud()/brewer.pal() calls.
library(RColorBrewer)
library(wordcloud)
wordcloud(rez$name, rez$count, min.freq = 7, colors = brewer.pal(8,
    "Dark2"))

Who Am I?

Dzidorius Martinaitis

Java, C++ and R developer & data junkie