In [1]:
import numpy as np
# Number of synthetic 2-D points to generate.
num_points = 2000

# Seed NumPy's global RNG so that re-running this cell (or the whole notebook
# from a fresh kernel) regenerates exactly the same point cloud.
SEED = 0
np.random.seed(SEED)

# Build a two-blob data set. Each point is drawn, with equal probability,
# from a Gaussian centred at (0, 0) with std 0.9 on both axes, or from a
# Gaussian centred at (3, 3) with std 0.5 on both axes.
vector_set = []
for i in range(num_points):  # range() works on both Python 2 and Python 3
    if np.random.random() > 0.5:
        vector_set.append([np.random.normal(0.0, 0.9),
                           np.random.normal(0.0, 0.9)])
    else:
        vector_set.append([np.random.normal(3.0, 0.5),
                           np.random.normal(3.0, 0.5)])
In [2]:
# Preview only the first few points; echoing all 2000 pairs floods the output
# area and bloats the saved notebook.
vector_set[:5]
Out[2]:
In [3]:
%matplotlib nbagg
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
# Tabulate the generated points: one row per point, columns "x" and "y".
df = pd.DataFrame(vector_set, columns=["x", "y"])

# Plain scatter plot of the raw data (regression line disabled).
sns.lmplot(
    "x", "y",
    data=df,
    fit_reg=False,
    size=6,
)
plt.show()
In [4]:
import tensorflow as tf
# Wrap the generated random points in a constant tensor.
vectors = tf.constant(vector_set)
# Number of clusters k-means should look for.
k = 4
# Inspect the tensor's static shape; with 2000 two-element points this is
# (2000, 2).
vectors.get_shape()
Out[4]:
In [5]:
# Initial centroids: shuffle the points, then slice off the first k rows (all
# columns), i.e. pick k of the data points at random. Stored in a Variable so
# the centroids can be reassigned on every iteration.
centroides = tf.Variable(tf.slice(tf.random_shuffle(vectors),[0,0],[k,-1]))
# Inspect the static shape; the slice size [k, -1] gives k rows.
centroides.get_shape()
Out[5]:
In [6]:
# Insert a new leading axis (axis 0) in front of the points tensor so it can
# later be combined with the centroids tensor.
expanded_vectors = tf.expand_dims(vectors,0)
# Inspect the static shape: a size-1 axis followed by the shape of `vectors`.
expanded_vectors.get_shape()
Out[6]:
In [7]:
# Insert a new axis at position 1 in the centroids tensor, so the centroid
# index stays on axis 0 and a size-1 axis sits where the point index will go.
expanded_centroides = tf.expand_dims(centroides,1)
# Inspect the static shape.
expanded_centroides.get_shape()
Out[7]:
In [8]:
# Point minus centroid, for every (centroid, point) pair. The two operands
# have size-1 axes in complementary positions, so the subtraction yields one
# difference vector per pair: axis 0 = centroid, axis 1 = point, axis 2 = x/y.
# NOTE(review): tf.sub is the pre-1.0 TensorFlow name (later tf.subtract) -
# confirm against the installed TensorFlow version.
diff = tf.sub(expanded_vectors,expanded_centroides)
# Square each coordinate difference.
sqr = tf.square(diff)
# Sum the squares over axis 2 (the coordinates): squared Euclidean distance
# of every point to every centroid.
distance = tf.reduce_sum(sqr,2)
# For each point, the index along axis 0 (the centroid axis) with the smallest
# squared distance, i.e. the nearest centroid.
assignments = tf.argmin(distance,0)
In [9]:
# Static shape of the per-(centroid, point) difference tensor.
diff.get_shape()
Out[9]:
In [10]:
# Static shape of the element-wise squared differences (tf.square of `diff`).
sqr.get_shape()
Out[10]:
In [11]:
# Static shape after summing the squared differences over axis 2.
distance.get_shape()
Out[11]:
In [12]:
# Static shape after taking the argmin over axis 0 of `distance`.
assignments.get_shape()
Out[12]:
In [13]:
# The whole nearest-centroid computation written as a single expression.
# This binding is immediately replaced by the two-line version below; it is
# shown only for comparison.
assignments = tf.argmin(tf.reduce_sum(tf.square(tf.sub(expanded_vectors,expanded_centroides)),2),0)
# Splitting it over two lines reads better: squared distances first, then the
# index of the nearest centroid for each point.
distance = tf.reduce_sum(tf.square(tf.sub(expanded_vectors,expanded_centroides)),2)
assignments = tf.argmin(distance,0)
In [14]:
# Recompute each centroid as the mean of the points currently assigned to it.
cluster_means = []
for c in xrange(k):
    # Row indices of the points whose nearest centroid is c, reshaped to a
    # single row so the gather below keeps a leading axis of size 1.
    member_rows = tf.reshape(tf.where(tf.equal(assignments, c)), [1, -1])
    # Gather those points and average them over axis 1 (the member axis).
    member_points = tf.gather(vectors, member_rows)
    cluster_means.append(tf.reduce_mean(member_points, reduction_indices=[1]))

# Stack the per-cluster means along axis 0, one row per cluster.
means = tf.concat(0, cluster_means)
In [15]:
# Op that overwrites the centroid variable with the freshly computed means.
update_centroides = tf.assign(centroides, means)
# Op that initialises all variables; it is run once after the session is
# created.
init_op = tf.initialize_all_variables()
In [16]:
# Create the session and initialise the centroid variable.
sess = tf.Session()
sess.run(init_op)

# Run 100 k-means iterations.
#
# The centroid values are taken from the value returned by the assign op
# rather than from a separate fetch of `centroides`: when a variable and an op
# that assigns to it are fetched in the same run() call, whether the read sees
# the old or the new value is not specified. The assign op's output is always
# the updated centroids.
#
# `assignments` feeds into `means`, which the assign op consumes, so the
# fetched assignments are the ones computed before this step's update.
for step in range(100):  # range() works on both Python 2 and Python 3
    centroid_values, assignment_values = sess.run([update_centroides,
                                                   assignments])
In [17]:
# Show the final centroid coordinates. The call form prints the same text on
# Python 2 and is also valid syntax on Python 3.
print(centroid_values)
In [18]:
# Pair each generated point with the cluster index assigned to it. Only as
# many points as there are assignment values are used.
n_assigned = len(assignment_values)
assigned_points = vector_set[:n_assigned]
data = {
    "x": [p[0] for p in assigned_points],
    "y": [p[1] for p in assigned_points],
    "cluster": list(assignment_values),
}
df = pd.DataFrame(data)

# Scatter plot coloured by cluster (no regression line, no legend).
sns.lmplot(
    "x", "y",
    data=df,
    fit_reg=False,
    size=7,
    hue="cluster",
    legend=False,
)
plt.show()
In [22]:
%matplotlib nbagg
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Short alias for NumPy's linear-algebra module.
la = np.linalg

# Vocabulary: one label per row of X.
words = ["I", "like", "enjoy", "deep", "learning", "NLP", "flying", "."]

# 8x8 symmetric integer matrix over the vocabulary.
# NOTE(review): presumably word-word co-occurrence counts with rows/columns in
# the order of `words` - confirm against the corpus it was built from.
X = np.array([
    [0, 2, 1, 0, 0, 0, 0, 0],
    [2, 0, 0, 1, 0, 1, 0, 0],
    [1, 0, 0, 0, 0, 0, 1, 0],
    [0, 1, 0, 0, 1, 0, 0, 0],
    [0, 0, 0, 1, 0, 0, 0, 1],
    [0, 1, 0, 0, 0, 0, 0, 1],
    [0, 0, 1, 0, 0, 0, 0, 1],
    [0, 0, 0, 0, 1, 1, 1, 0],
])

# Reduced singular value decomposition of X.
U, s, Vh = la.svd(X, full_matrices=False)

# Place each word at the coordinates given by the first two columns of U.
for i, word in enumerate(words):
    plt.text(U[i, 0], U[i, 1], word)