# blogCorpus.R
# uploaded by: fwild

# expects as input a character vector of blog texts named 'texts' (one document per element)

library(tm)    # corpus handling and term-document matrices
library(lsa)   # latent semantic analysis
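
# a minimal sketch of how 'texts' could be assembled, assuming a (hypothetical)
# directory "blogs/" containing one plain-text file per blog post:
#texts = sapply(list.files("blogs/", full.names=TRUE),
#               function(f) paste(readLines(f, warn=FALSE), collapse=" "))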

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
# create a corpus and textmatrix

corpus = Corpus(VectorSource(texts), readerControl=list(reader=readPlain, language="en", 
         load=TRUE ) )
corpus = tm_map(corpus, FUN=stripWhitespace)

corpus = tm_map(corpus, FUN=tolower)
#corpus = tm_map(corpus, FUN=as.PlainTextDocument)

#remove_phrases <- function (text) {
#  text = gsub("\""," ", text)
#  text
#}
#corpus = tm_map(corpus, FUN=remove_phrases)

#corpus2 = tm_map(corpus, stemDocument, language="english")
corpus2 = corpus

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  
# turn into a textmatrix

ctrl = list(weighting=weightTfIdf, stemming=TRUE, removePunctuation=TRUE, removeNumbers=TRUE, stopwords=TRUE, minWordLength=3, tokenize=MC_tokenizer, bounds=list(local = c(10,5000)))
tm = TermDocumentMatrix(corpus2, control=ctrl ) 
#tm = removeSparseTerms(tm, 0.98)

ctrl2 = list(weighting=weightTfIdf, stemming=FALSE, removePunctuation=TRUE, removeNumbers=TRUE, stopwords=TRUE, minWordLength=3, tokenize=MC_tokenizer, bounds=list(local = c(10,5000)))
tm_unstemmed = TermDocumentMatrix(corpus2, control=ctrl2 ) 
#tm_unstemmed = removeSparseTerms(tm_unstemmed, 0.98)
dict = Dictionary(tm_unstemmed)
save(dict,file="telmap-blogs-dict.RData")

#if ( any(colSums(tm)==0) ) tm[,-which(colSums(tm)==0)]

# complete the stemmed terms
#rownames(tm) = stemCompletion(rownames(tm), corpus)
rownames(tm) = stemCompletion(rownames(tm), dict)

# stem completion might have introduced empty terms and duplicates
if (anyDuplicated(rownames(tm))) tm = tm[!duplicated(rownames(tm)),]
tm = tm[(rownames(tm)!=""),]

rsums = rowSums(as.matrix(tm))
csums = colSums(as.matrix(tm))
if ( any(rsums==0)) tm = tm[!(rsums==0),]
if ( any(csums==0)) tm = tm[,!(csums==0)]

save(tm, file="telmap-blogs-tm.RData")

#inspect(tm[1:10,1:10])
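
# optional sanity check (a sketch): list the terms that occur in the most
# documents; uses a dense copy, so this assumes the matrix fits into memory
#docfreq = rowSums(as.matrix(tm) > 0)
#head(sort(docfreq, decreasing=TRUE), 20)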

# -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -  -
# latent semantic analysis

#lowf = which( freqs < 0.3 )
#highf = which( freqs > 10 )
#rf = c( lowf, highf )
#tm2 = tm[-(rf),]
#save(tm2,file="telmap-blogs-tm2reduced.RData")
#load("telmap-blogs-tm2reduced.RData")

# lsa() expects an ordinary matrix, so convert the sparse term-document matrix first
space = lsa(as.matrix(tm), dims=dimcalc_raw())
save(space, file="telmap-blogs-space.RData")

# share of variance explained by each singular value, and its cumulative sum
varsk = space$sk^2/sum(space$sk^2)
varsk2 = cumsum(varsk)
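
# optional (a sketch, assuming an interactive graphics device): inspect the
# cumulative variance curve before choosing a cut-off
#plot(varsk2, type="l", xlab="dimension", ylab="cumulative share of variance")
#abline(h=0.8, lty=2)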

# find the number of dimensions at which the cumulative variance explained first reaches 80%
dims80 = which(varsk2 > 0.8)[1]

# keep the full space, then truncate the partial SVD to the first dims80 dimensions
spaceorig = space
space$tk = space$tk[,1:dims80]
space$dk = space$dk[,1:dims80]
space$sk = space$sk[1:dims80]
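
# a minimal usage sketch: pairwise document similarities in the reduced space.
# documents are represented by the rows of dk scaled by the singular values;
# lsa's cosine() compares the columns of a matrix, hence the transpose
#docvecs = t(space$dk %*% diag(space$sk))   # one column per blog post
#docsims = cosine(docvecs)
#docsims[1:5, 1:5]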