## Warning in readLines("tm_test.txt"): 'tm_test.txt'에서 불완전한 마지막 행이
## 발견되었습니다
# Display the contents of data1 (one element per line of text).
print(data1)
## [1] "I like apple and banana , but hate cherry"
## [2] "I love banana , but not mango"
## [3] "I hate peach , but like cherry"
## [4] "I want to eat grape ~! "
# Report the class of data1.
print(class(data1))
## [1] "character"
tm 패키지 사용하기
library(tm)
## Warning: package 'tm' was built under R version 3.2.2
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.2.2
# Build a corpus from the character vector `data1`. A corpus ("collection of
# texts") is the data type the tm package operates on; each element of
# `data1` becomes one document.
corp <- Corpus(VectorSource(data1))
corp  # print the corpus summary

# Clean the documents step by step; each step takes the previous result.
corp2 <- tm_map(corp, stripWhitespace)  # collapse repeated whitespace
# tolower() is a plain character function, so it is wrapped in
# content_transformer() to keep every element a text document. With that in
# place, the old `tm_map(corp2, PlainTextDocument)` re-wrapping step is no
# longer needed and is omitted.
corp2 <- tm_map(corp2, content_transformer(tolower))  # lower-case all text
corp2 <- tm_map(corp2, removeNumbers)  # remove digits
corp2 <- tm_map(corp2, removePunctuation)  # remove punctuation

# Extend the default English stop-word list with extra words to drop.
sword2 <- c(stopwords("en"), "and", "but", "not")
corp2 <- tm_map(corp2, removeWords, sword2)  # remove stop words

# Build the term-document matrix from the cleaned corpus and print it.
tdm2 <- TermDocumentMatrix(corp2)
tdm2
## <<TermDocumentMatrix (terms: 11, documents: 4)>>
## Non-/sparse entries: 15/29
## Sparsity : 66%
## Maximal term length: 6
## Weighting : term frequency (tf)
TermDocumentMatrix 다시 생성하기
# Rebuild the term-document matrix from corp2, then display it.
tdm2 <- TermDocumentMatrix(corp2)
tdm2
## <<TermDocumentMatrix (terms: 11, documents: 4)>>
## Non-/sparse entries: 15/29
## Sparsity : 66%
## Maximal term length: 6
## Weighting : term frequency (tf)
## Warning: package 'igraph' was built under R version 3.2.2
##
## Attaching package: 'igraph'
##
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
##
## The following object is masked from 'package:base':
##
## union