{"id":16245,"date":"2017-12-19T09:00:32","date_gmt":"2017-12-19T00:00:32","guid":{"rendered":"http:\/\/www.techscore.com\/blog\/?p=16245"},"modified":"2018-11-14T16:33:42","modified_gmt":"2018-11-14T07:33:42","slug":"connect_two_vector_spaces","status":"publish","type":"post","link":"https:\/\/www.techscore.com\/blog\/2017\/12\/19\/connect_two_vector_spaces\/","title":{"rendered":"\uff12\u3064\u306e\u7570\u306a\u308b\u8a00\u8a9e\u304b\u3089\u4f5c\u3089\u308c\u305f\u5358\u8a9e\u30d9\u30af\u30c8\u30eb\u7a7a\u9593\u3092\u7dda\u5f62\u5199\u50cf\u3059\u308b"},"content":{"rendered":"
\u3053\u3093\u306b\u3061\u306f\u3001\u5ca1\u5d0e\u3067\u3059\u3002 \u4eca\u56de\u7d39\u4ecb\u3059\u308b\u306e\u306f\u300c\u7570\u306a\u308b\u8a00\u8a9e\u304b\u3089\u4f5c\u3089\u308c\u305f\uff12\u3064\u306e\u5358\u8a9e\u30d9\u30af\u30c8\u30eb\u7a7a\u9593\u3092\u3064\u306a\u3052\u308b\u5909\u63db\u884c\u5217\u3092\u5b66\u7fd2\u3059\u308b\u300d\u3068\u3044\u3046\u8a71\u3067\u3059\u3002<\/p>\n \u3053\u308c\u306f\u5143\u3005\u3001\u6a5f\u68b0\u7ffb\u8a33\u3067\u5fc5\u8981\u306a\u300c\u5358\u8a9e\u30fb\u30d5\u30ec\u30fc\u30ba\u8f9e\u66f8\u300d\u3092\u81ea\u52d5\u7684\u306b\u751f\u6210\u3059\u308b\u305f\u3081\u306b Google \u306e Tomas Mikolov \u306b\u3088\u3063\u3066\u63d0\u6848\u3055\u308c\u307e\u3057\u305f<\/a>\u3002*1<\/em><\/a><\/p>\n \u6a5f\u68b0\u7ffb\u8a33\u3068\u306f\u7aef\u7684\u306b\u8a00\u3046\u3068\u3001\u7247\u65b9\u306e\u8a00\u8a9e\u306e\u8a00\u8449\u3092\u4ed6\u65b9\u306e\u8a00\u8a9e\u306e\u8a00\u8449\u306b\u5909\u63db\u3059\u308b\u3053\u3068\u3067\u3059\u3002 \u305d\u3053\u3067 Mikolov \u306f\u81ea\u52d5\u7684\u306b\u3053\u306e\u8f9e\u66f8\u3092\u751f\u6210\u3059\u308b\u3053\u3068\u304c\u3067\u304d\u306a\u3044\u304b\u3068\u8003\u3048\u307e\u3057\u305f\u3002 \u305d\u3057\u3066\u3001\u305d\u308c\u305e\u308c\u306e\u7a7a\u9593\u306f\u30b7\u30f3\u30d7\u30eb\u306b\u7dda\u5f62\u5199\u50cf\u3067\u304d\u3066\uff08\u7a7a\u9593\u3067\u8a00\u3046\u3068\u3053\u308d\u306e\u56de\u8ee2\u3084\u30b9\u30b1\u30fc\u30ea\u30f3\u30b0\uff09\u3001\u534a\u81ea\u52d5\u7684\u306b\u8f9e\u66f8\u3092\u4f5c\u308b\u3053\u3068\u304c\u3067\u304d\u308b\u306e\u3067\u306f\u306a\u3044\u304b\u3068\u8003\u3048\u3001\u305d\u306e\u53ef\u80fd\u6027\u3092\u3053\u306e\u8ad6\u6587\u3067\u793a\u3057\u307e\u3057\u305f\u3002<\/p>\n \u3053\u3053\u3067\u306f Mikolov \u306f\u4e00\u3064\u306e\u5927\u304d\u306a\u4eee\u5b9a\u3092\u7f6e\u3044\u3066\u3044\u307e\u3059\u3002 \u8003\u3048\u65b9\u3068\u30a2\u30eb\u30b4\u30ea\u30ba\u30e0\u306f\u3068\u3066\u3082\u30b7\u30f3\u30d7\u30eb\u3067\u3059\u3002<\/p>\n 
\u8ad6\u6587\u306b\u3088\u308b\u3068\u3001\u5909\u63db\u884c\u5217\u306f\u6b21\u306e\u640d\u5931\u95a2\u6570\u3092\u6700\u5c0f\u5316\u3059\u308b\u3088\u3046\u306b\u78ba\u7387\u7684\u52fe\u914d\u964d\u4e0b\u6cd5\u3067\u5b66\u7fd2\u3057\u307e\u3059\u3002<\/p>\n \u3053\u3053\u3067 W \u306f\u5b66\u7fd2\u3059\u308b\u5909\u63db\u884c\u5217\u3001x \u306f\u5199\u50cf\u5143\u3001z \u306f\u5199\u50cf\u5148\u3001n \u306f\u7a2e\u306e\u5358\u8a9e\u6570\u3067\u3059\u3002<\/p>\n \u3053\u3046\u3084\u3063\u3066\u5b66\u7fd2\u3055\u308c\u305f\u5909\u63db\u884c\u5217\u306f\u3001\u305d\u308c\u305e\u308c\u306e\u30d9\u30af\u30c8\u30eb\u7a7a\u9593\u3092\u30b7\u30f3\u30d7\u30eb\u306b\u7dda\u5f62\u5199\u50cf\u3059\u308b\u3082\u306e\u3068\u306a\u308a\u307e\u3059\u3002<\/p>\n \u4eca\u56de\u306f tensorflow \u3092\u7528\u3044\u307e\u3057\u305f\u3002<\/p>\n \u4eca\u56de\u306f\u300c\u7570\u306a\u308b\u8a00\u8a9e\u304b\u3089\u4f5c\u3089\u308c\u305f\uff12\u3064\u306e\u5358\u8a9e\u30d9\u30af\u30c8\u30eb\u7a7a\u9593\u3092\u3064\u306a\u3052\u308b\u5909\u63db\u884c\u5217\u3092\u5b66\u7fd2\u3059\u308b\u300d\u3068\u3044\u3046\u8a71\u3092\u3057\u307e\u3057\u305f\u3002<\/p>\n \u30a2\u30eb\u30b4\u30ea\u30ba\u30e0\u3068\u3057\u3066\u306f\u3068\u3066\u3082\u30b7\u30f3\u30d7\u30eb\u3067\u3001\u524d\u8ff0\u306e\u5f37\u3044\u4eee\u5b9a\u300c\u7570\u306a\u308b\u30c9\u30e1\u30a4\u30f3\u306e\u30c7\u30fc\u30bf\u306b\u304a\u3044\u3066\u3001\u305d\u308c\u305e\u308c\u306e\u30d9\u30af\u30c8\u30eb\u7a7a\u9593\u304c\u4f3c\u305f\u5e7e\u4f55\u5b66\u7684\u7a7a\u9593\u3092\u69cb\u6210\u3057\u3066\u3044\u308b\u300d<\/strong>\u304c\u305d\u308c\u3092\u53ef\u80fd\u306b\u3057\u3066\u3044\u307e\u3059\u3002 \u4f8b\u3048\u3070\u3001\u4f3c\u305f\u3088\u3046\u306a\u5546\u6750\u3092\u6271\u3063\u3066\u3044\u308b EC \u3067\u3001\u53cc\u65b9\u3067\u4f3c\u305f\u3088\u3046\u306a\u58f2\u308c\u65b9\u3092\u3057\u3066\u3044\u308b\u5546\u54c1\u304c\u308f\u304b\u308b\u3088\u3046\u306b\u306a\u308b\u304b\u3082\u3057\u308c\u307e\u305b\u3093\u3002 
\u4eca\u3001\u6a5f\u68b0\u5b66\u7fd2\u3067\u306f\u300c\u8ee2\u79fb\u5b66\u7fd2\u300d\u3084\u300c\u30c9\u30e1\u30a4\u30f3\u9069\u5fdc\u300d<\/a>\u304c\u6ce8\u76ee\u3055\u308c\u3066\u3044\u307e\u3059\u3002 \u6a5f\u68b0\u5b66\u7fd2\u3092\u3057\u3066\u3044\u308b\u3068\u3001\u5c11\u306a\u3044\u30c7\u30fc\u30bf\u3084\u504f\u3063\u305f\u30c7\u30fc\u30bf\u306a\u3069\u3092\u4f7f\u308f\u3056\u308b\u3092\u5f97\u306a\u3044\u5834\u5408\u3082\u591a\u3005\u3042\u308b\u3068\u601d\u3044\u307e\u3059\u3002 \u3053\u3093\u306b\u3061\u306f\u3001\u5ca1\u5d0e\u3067\u3059\u3002 \u4eca\u56de\u7d39\u4ecb\u3059\u308b\u306e\u306f\u300c\u7570\u306a\u308b\u8a00\u8a9e\u304b\u3089\u4f5c\u3089\u308c\u305f\uff12\u3064\u306e\u5358\u8a9e\u30d9\u30af\u30c8\u30eb\u7a7a\u9593\u3092\u3064\u306a\u3052\u308b\u5909\u63db\u884c\u5217\u3092\u5b66\u7fd2\u3059\u308b\u300d\u3068\u3044\u3046\u8a71\u3067\u3059\u3002
\n\u3053\u306e\u8a18\u4e8b\u306f TECHSCORE Advent Calendar 2017<\/a> \u306e 19 \u65e5\u76ee\u306e\u8a18\u4e8b\u3067\u3059\u3002<\/p>\n\u5358\u8a9e\u30d9\u30af\u30c8\u30eb\u3092\u3064\u306a\u3052\u308b\u5909\u63db\u884c\u5217\u3068\u306f<\/h2>\n
\n\u5909\u63db\u306b\u306f\uff12\u3064\u306e\u8a00\u8a9e\u9593\u306e\u5358\u8a9e\u3084\u30d5\u30ec\u30fc\u30ba\u3092\u30de\u30c3\u30d4\u30f3\u30b0\u3057\u305f\u8f9e\u66f8 (\u65e5\u672c\u8a9e\u300c\u732b\u300d:\u82f1\u8a9e\u300ccat\u300d\u306a\u3069\u306e\u610f\u5473\u5bfe\u5fdc\u8f9e\u66f8) \u3092\u7528\u3044\u307e\u3059\u304c\u3001\u7db2\u7f85\u6027\u304c\u3042\u308b\u826f\u3044\u8f9e\u66f8\u3092\u4f5c\u308b\u306e\u306f\u3068\u3066\u3082\u624b\u9593\u304c\u304b\u304b\u308a\u307e\u3059\u3002*2<\/em><\/a><\/p>\n
\nMikolov \u306f\u69d8\u3005\u306a\u8a00\u8a9e\u306e\u5358\u8a9e\u30d9\u30af\u30c8\u30eb\u7a7a\u9593\u3092\u53ef\u8996\u5316\u3057\u3066\u3044\u308b\u3068\u304d\u306b\u300c\u7570\u306a\u308b\u8a00\u8a9e\u3067\u3082\u4f3c\u305f\u3088\u3046\u306a\u5358\u8a9e\u306e\u4f4d\u7f6e\u95a2\u4fc2\u304c\u3064\u304f\u3089\u308c\u308b\uff08\u76f8\u5bfe\u7684\u306a\u4f4d\u7f6e\u95a2\u4fc2\u304c\u4f3c\u3066\u3044\u308b\uff09\u3053\u3068\u300d\u306b\u6c17\u4ed8\u304d\u307e\u3057\u305f\u3002<\/p>\n
\n\u305d\u308c\u306f\u3001\u300c\u7570\u306a\u308b\u8a00\u8a9e\u306b\u304a\u3044\u3066\u3082\u3001\u305d\u308c\u305e\u308c\u306e\u30d9\u30af\u30c8\u30eb\u7a7a\u9593\u304c\u4f3c\u305f\u5e7e\u4f55\u5b66\u7684\u7a7a\u9593\u3092\u69cb\u6210\u3057\u3066\u3044\u308b\u300d<\/strong>\u3068\u3044\u3046\u3053\u3068\u3067\u3059\u3002
\n\u305d\u3057\u3066\u305d\u308c\u6545\u306b\u30b7\u30f3\u30d7\u30eb\u306b\u7dda\u5f62\u5199\u50cf\u3067\u304d\u308b\u3001\u3068\u7d50\u8ad6\u3065\u3051\u3066\u3044\u307e\u3059\u3002<\/p>\n<\/a><\/p>\n
\u30a2\u30eb\u30b4\u30ea\u30ba\u30e0<\/h2>\n
\n
\n$L = \\sum_{i=1}^n || Wx_i - z_i ||^2$<\/p>\n
\u30b3\u30fc\u30c9<\/h2>\n
#!\/usr\/bin\/env python\r\n# -*- coding: utf-8 -*-\r\nimport sys\r\nimport os\r\nimport math\r\nimport numpy as np\r\nos.environ['TF_CPP_MIN_LOG_LEVEL']='2'\r\nimport tensorflow as tf\r\nimport gensim, logging\r\nlogging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)\r\n\"\"\"\r\n python 2.7.14\r\n gensim (3.2.0)\r\n tensorflow (1.4.1)\r\n numpy (1.13.3)\r\n\"\"\"\r\n \r\n\r\n# 1. \u307e\u305a\u3001\u305d\u308c\u305e\u308c\u306e\u8a00\u8a9e\u3067\u5358\u8a9e\u30d9\u30af\u30c8\u30eb\uff08CBOW\/skip-gram \u306a\u3069\uff09\u3092\u6e96\u5099\u3057\u307e\u3059\r\n# (\u3053\u3053\u3067\u306f word2vec \u30d0\u30a4\u30ca\u30ea\u30d5\u30a9\u30fc\u30de\u30c3\u30c8\u306e\u30e2\u30c7\u30eb\u3092\u60f3\u5b9a\u3057\u3066\u3044\u307e\u3059\u3002)\r\nmodel_file_a = \"language_A.bin\"\r\nmodel_file_b = \"language_B.bin\"\r\nmodel_a = gensim.models.KeyedVectors.load_word2vec_format(model_file_a, binary=True)\r\nmodel_b = gensim.models.KeyedVectors.load_word2vec_format(model_file_b, binary=True)\r\n\r\n\r\n# 2. \u6b21\u306b\u3001\u305d\u308c\u305e\u308c\u306e\u8a00\u8a9e\u3092\u3064\u306a\u3050\u4e0a\u3067\u300c\u7a2e\u300d\u3068\u306a\u308b\u5358\u8a9e\u5bfe\u5fdc\u8f9e\u66f8\u3092\u6e96\u5099\u3057\u307e\u3059\r\n# \u4e88\u3081\u6e96\u5099\u3055\u308c\u3066\u3044\u308b\u3068\u3057\u307e\u3059( vocab_dic = {'dog':'perro', 'horse':'caballo' .... )\u3002\r\n\r\n# 2'. \u305d\u308c\u305e\u308c\u306e\u8a00\u8a9e\u306e\u7a2e\u306e\u5358\u8a9e\u30d9\u30af\u30c8\u30eb\u3092\u62bd\u51fa\u3057\u307e\u3059.\r\nvocab_a, vocab_b = vocab_dic.keys(), vocab_dic.values()\r\nvec_a = model_a[vocab_a]\r\nvec_b = model_b[vocab_b]\r\n\r\n\r\n# 3. 
\u305d\u308c\u305e\u308c\u306e\u8a00\u8a9e\u306e\u7a2e\u306e\u5358\u8a9e\u30d9\u30af\u30c8\u30eb\u304b\u3089\u7dda\u5f62\u5199\u50cf\u884c\u5217\u3092\u5b66\u7fd2\u3057\u307e\u3059\u3002\r\n\r\n# \u30d9\u30af\u30c8\u30eb\u7a7a\u9593\u6b21\u5143\u6570\r\n_, dvec_x = vec_a.shape\r\n_, dvec_z = vec_b.shape\r\n\r\n# \u5199\u50cf\u5143\r\nx = tf.placeholder(tf.float32, [None, dvec_x])\r\n\r\n# \u5199\u50cf\u5148\r\nz = tf.placeholder(tf.float32, [None, dvec_z])\r\n\r\n# \u5909\u63db\u884c\u5217:\u6a19\u6e96\u504f\u5dee 0.01 \u306e\u30ac\u30a6\u30b9\u5206\u5e03\u3067\u521d\u671f\u5316\r\nW = tf.Variable(tf.random_normal([dvec_x, dvec_z], stddev=0.01))\r\n\r\n# \u640d\u5931\u95a2\u6570\u5b9a\u7fa9 $L = \\sum_{i=1}^n || Wx_i - z_i ||^2$\r\nloss = tf.reduce_sum(tf.square(z - tf.matmul(x, tf.transpose(W))))\r\n\r\n# Adam \u30aa\u30d7\u30c6\u30a3\u30de\u30a4\u30b6\u3092\u78ba\u7387\u7684\u52fe\u914d\u964d\u4e0b\u6cd5\u306b\u4f7f\u3044\u307e\u3059.\r\ntrain_step = tf.train.AdamOptimizer(learning_rate=0.001).minimize(loss)\r\n\r\n\r\ndef next_batch(num, data, labels):\r\n '''\r\n \u30d0\u30c3\u30c1\u51e6\u7406\u95a2\u6570\r\n https:\/\/stackoverflow.com\/questions\/40994583\/how-to-implement-tensorflows-next-batch-for-own-data\/40995666\r\n Return a total of `num` random samples and labels. 
\r\n '''\r\n idx = np.arange(0 , len(data))\r\n np.random.shuffle(idx)\r\n idx = idx[:num]\r\n data_shuffle = [data[ i] for i in idx]\r\n labels_shuffle = [labels[ i] for i in idx]\r\n\r\n return np.asarray(data_shuffle), np.asarray(labels_shuffle)\r\n\r\n# \u5b66\u7fd2\r\nwith tf.Session() as sess:\r\n\r\n sess.run(tf.global_variables_initializer())\r\n \r\n # \u30d0\u30c3\u30c1\u30b5\u30a4\u30ba 100 \u306e\u78ba\u7387\u7684\u52fe\u914d\u964d\u4e0b\u6cd5\u309210000\u56de\u884c\u3046\r\n for i in range(10000):\r\n \r\n batch_x, batch_z = next_batch(100, vec_a, vec_b)\r\n sess.run(train_step, feed_dict={x: batch_x, z:batch_z})\r\n if i % 1000 == 0:\r\n print 'Step: %d, Loss: %f'% (i, sess.run(loss, feed_dict={x: batch_x, z:batch_z}))\r\n \r\n\r\n # \u5b66\u7fd2\u3057\u305f\u5909\u63db\u884c\u5217\u3092\u62bd\u51fa\r\n W_ = sess.run([W])\r\n\r\nprint W_[0]\r\n\r\n<\/pre>\n
\u307e\u3068\u3081<\/h2>\n
\n\u3053\u306e\u8ad6\u6587\u306e\u809d\u306f\u3053\u306e\u4eee\u5b9a\u3067\u3059\u304c\u3001\u305d\u308c\u304c\u6210\u308a\u7acb\u3061\u305d\u3046\u306a\u30c7\u30fc\u30bf\u3067\u3042\u308c\u3070\u30b7\u30f3\u30d7\u30eb\u306a\u3060\u3051\u306b\u69d8\u3005\u306a\u3053\u3068\u306b\u5fdc\u7528\u3067\u304d\u305d\u3046\u3067\u3059\u3002<\/p>\n
\n\u7ffb\u8a33\u306e\u6587\u8108\u3067\u63d0\u6848\u3055\u308c\u305f\u6280\u8853\u3067\u3059\u304c\u3001\u30de\u30fc\u30b1\u30c6\u30a3\u30f3\u30b0\u306b\u3082\u6d3b\u7528\u3067\u304d\u308b\u3068\u3057\u305f\u3089\u304a\u3082\u3057\u308d\u3044\u3067\u3059\u306d\u3002<\/p>\n
\n\u300c\u8ee2\u79fb\u5b66\u7fd2\u300d\u3084\u300c\u30c9\u30e1\u30a4\u30f3\u9069\u5fdc\u300d\u306b\u304a\u3044\u3066\u3082\u3001\u7570\u306a\u308b\u30c9\u30e1\u30a4\u30f3\u306e\u30c7\u30fc\u30bf\u306e\u5206\u5e03\u306e\u9055\u3044\u306b\u5bfe\u3059\u308b\u4eee\u5b9a\u3092\u3069\u306e\u3088\u3046\u306b\u7f6e\u304f\u306e\u304b\u3001\u3068\u3044\u3046\u306e\u306f\u5927\u5207\u306a\u30dd\u30a4\u30f3\u30c8\u3068\u306a\u308a\u307e\u3059\u3002
\n\u3053\u306e Mikolov \u306e\u8ad6\u6587\u306f\u5c11\u3057\u6614\u306e\u3082\u306e\u306a\u306e\u3067\u3059\u304c\u3001\u305d\u3046\u3044\u3063\u305f\u89b3\u70b9\u3067\u307f\u308b\u3068\u4ee5\u524d\u8aad\u3093\u3060\u3068\u304d\u3068\u306f\u307e\u305f\u7570\u306a\u308b\u6349\u3048\u65b9\u304c\u3067\u304d\u3066\u65b0\u9bae\u3067\u3057\u305f\u3002<\/p>\n
\n\u65e2\u306b\u5b9a\u8a55\u306e\u3042\u308b\u30e2\u30c7\u30eb\u3084\u76f4\u63a5\u7684\u306b\u95a2\u4fc2\u306e\u306a\u3044\u30c7\u30fc\u30bf\u3092\u30c9\u30e1\u30a4\u30f3\u9069\u5fdc\u3057\u3066\u6d3b\u7528\u3059\u308b\u3053\u3068\u3082\u4eca\u5f8c\u306f\u5897\u3048\u308b\u3067\u3057\u3087\u3046\u3057\u3001\u305d\u3046\u3044\u3063\u305f\u3068\u3053\u308d\u306b\u3082\u65b0\u305f\u306a\u30d3\u30b8\u30cd\u30b9\u30b7\u30fc\u30c9\u304c\u51fa\u3066\u304f\u308b\u304b\u3082\u3057\u308c\u307e\u305b\u3093\u3002<\/p>\n\u53c2\u7167<\/h2>\n
\n
\n\u3053\u306e\u8a18\u4e8b\u306f TECHSCORE Advent Calendar 2017 \u306e 19 \u65e5\u76ee\u306e\u8a18\u4e8b\u3067\u3059\u3002<\/p>\n
\u7d9a\u304d\u3092\u8aad\u3080...<\/a><\/p>\n","protected":false},"author":48,"featured_media":0,"comment_status":"closed","ping_status":"open","sticky":false,"template":"","format":"standard","meta":[],"categories":[276,254,259,18],"tags":[141,120,202],"_links":{"self":[{"href":"https:\/\/www.techscore.com\/blog\/wp-json\/wp\/v2\/posts\/16245"}],"collection":[{"href":"https:\/\/www.techscore.com\/blog\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.techscore.com\/blog\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.techscore.com\/blog\/wp-json\/wp\/v2\/users\/48"}],"replies":[{"embeddable":true,"href":"https:\/\/www.techscore.com\/blog\/wp-json\/wp\/v2\/comments?post=16245"}],"version-history":[{"count":25,"href":"https:\/\/www.techscore.com\/blog\/wp-json\/wp\/v2\/posts\/16245\/revisions"}],"predecessor-version":[{"id":16461,"href":"https:\/\/www.techscore.com\/blog\/wp-json\/wp\/v2\/posts\/16245\/revisions\/16461"}],"wp:attachment":[{"href":"https:\/\/www.techscore.com\/blog\/wp-json\/wp\/v2\/media?parent=16245"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.techscore.com\/blog\/wp-json\/wp\/v2\/categories?post=16245"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.techscore.com\/blog\/wp-json\/wp\/v2\/tags?post=16245"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}