# Continue training an existing Korean Word2Vec model on a "KMA tokenized text file".
# Example model_name: "word2vec-kowiki.model"	(Word2Vec model)
# Usage: C> wv_KMA_tokens_train_ADD.py "word2vec-kowiki.model" "KMA tokenized text file"
# --> https://www.kaggle.com/pierremegret/gensim-word2vec-tutorial

import os
import sys

from gensim.models import Word2Vec

def test():
    """Print a fixed set of sanity-check queries against the global ``model``.

    Reads the module-level ``model`` (assigned in the ``__main__`` section)
    and prints, in order: the vectors of two words, three pairwise
    similarities, and two ``most_similar`` queries limited to five results.
    Takes no arguments and returns nothing; all output goes to stdout.
    """
    vectors = model.wv
    actor, actress, man = '배우', '여배우', '남자'

    # Raw embedding vectors of the two probe words.
    for word in (actor, actress):
        print(vectors.get_vector(word))

    # Pairwise similarity scores between the probe words.
    for left, right in ((actor, actress), (actor, man), (man, actress)):
        print(vectors.similarity(left, right))

    # Neighbours of a single word, then a positive/negative combination query.
    print(vectors.most_similar(positive=[man], topn=5))
    print(vectors.most_similar(positive=[man, actress], negative=[actor], topn=5))

def get_sentences(filename, encoding='utf-8'):
    """Read a whitespace-tokenized text file into a list of token lists.

    Every line of the file becomes one sentence, split on runs of
    whitespace. Blank lines are kept and yield an empty list, so the result
    has exactly one entry per line of the file.

    Args:
        filename: Path of the tokenized text file to read.
        encoding: Text encoding of the file. Defaults to ``'utf-8'``; pass
            e.g. ``'cp949'`` for files saved in that encoding.

    Returns:
        A list of sentences in file order, each a list of token strings.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content does not match ``encoding``.
    """
    print(f"\nTraining Korean word embedding vectors for <{filename}>.\n")

    # The context manager closes the handle even when decoding fails part-way
    # through the file; iterating the handle avoids a second copy of the text.
    with open(filename, "r", encoding=encoding) as f:
        return [line.split() for line in f]


if __name__ == "__main__":
    # Script entry point: load a saved Word2Vec model, continue training it on
    # the sentences of a tokenized text file, and save the result under a new
    # name. test() is run before and after training so the effect on the probe
    # words can be compared in the output.

    # Exit with a usage line instead of an IndexError when arguments are missing.
    if len(sys.argv) < 3:
        sys.exit(f"Usage: {sys.argv[0]} <Word2Vec model file> <KMA tokenized text file>")

    print("\nLoading Korean word embedding vectors for 'KMA tokenized text file'.\n")
    model_name = sys.argv[1]	# Word2Vec model -- 'word2vec-kowiki.model'
    file_name = sys.argv[2]		# 'KMA tokenized text file'

    # `model` has to remain a module-level name: test() reads it as a global.
    model = Word2Vec.load(model_name)
    test()

    sentences = get_sentences(file_name)
    # total_examples must describe the corpus actually passed to train().
    # model.corpus_count is the sentence count of the corpus the loaded model
    # was built on, which would skew learning-rate decay and progress reports.
    # NOTE(review): no build_vocab(..., update=True) is called, so presumably
    # words missing from the loaded vocabulary are not learned -- confirm that
    # this is intended.
    model.train(sentences, total_examples=len(sentences), epochs=30, report_delay=1)
    test()

    # Derive the output name from the input file's base name without its
    # extension (whatever its length), and keep it in the input's directory so
    # that a path argument does not produce a bogus 'word2vec-<dir>/...' path.
    in_dir, in_base = os.path.split(file_name)
    stem = os.path.splitext(in_base)[0]
    model_file = os.path.join(in_dir, 'word2vec-' + stem + '-ADD.model')
    model.save(model_file)
    print(f"\nWord2Vec model file <{model_file}> is created!\n")
