# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk the Kaggle input directory and print the full path of
# every file found, one path per line.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, name) for name in filenames):
        print(full_path)

# Any results you write to the current directory are saved as output.

/kaggle/input/amazon-fine-food-reviews/database.sqlite
/kaggle/input/amazon-fine-food-reviews/hashes.txt
/kaggle/input/amazon-fine-food-reviews/Reviews.csv

import nltk
from nltk.corpus import stopwords
import re
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the Amazon Fine Food Reviews CSV into a DataFrame.
r = pd.read_csv(
    filepath_or_buffer="/kaggle/input/amazon-fine-food-reviews/Reviews.csv",
)
# Preview the first five rows to check that the columns parsed as expected.
r.head(n=5)

# Merge all review texts that belong to the same ProductId into one document.
# Join with a single space: concatenating with '' would glue the last word of
# one review to the first word of the next (e.g. "great" + "This" ->
# "greatThis"), creating bogus tokens in the TF-IDF vocabulary.
r = r.groupby('ProductId')['Text'].agg(' '.join)
r.head()

ProductId
0006641040    These days, when a person says, "chicken soup"...
141278509X    This product by Archer Farms is the best drink...
2734888454    My dogs loves this chicken but its a product f...
2841233731    This book is easy to read and the ingredients ...
7310172001    This product is a very health snack for your p...
Name: Text, dtype: object

# Turn the per-product Series back into a two-column DataFrame:
# the index becomes the 'ProductId' column and the values become 'Text'.
# The axis and the Series are named explicitly so the resulting column labels
# do not depend on whatever names the Series currently carries.
r = r.rename_axis('ProductId').rename('Text').reset_index()
r.head()

# Build a TF-IDF model over the per-product documents, dropping English stop
# words and capping the vocabulary at 10,000 features.
tfidf = TfidfVectorizer(
    max_features=10000,
    stop_words="english",
)
# Learn the vocabulary/IDF weights and vectorize the 'Text' column in one pass.
response = tfidf.fit_transform(r.loc[:, "Text"])
# Echo the fitted vectorizer so the notebook displays its configuration.
tfidf

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=10000,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words='english', strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

# Display the first 20 rows of r; the keyword loop below iterates over
# the same first 20 row positions of the TF-IDF matrix.
r.head(20)

# For each of the first 20 products, print the five terms that best describe
# it, i.e. the terms with the highest TF-IDF weight in that product's row.
#
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use get_feature_names_out() when available and fall back to the old method
# on older installations.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = np.asarray(tfidf.get_feature_names_out())
else:
    feature_names = np.array(tfidf.get_feature_names())
n = 5

# Clamp to the number of rows so a corpus with fewer than 20 products does not
# raise an IndexError.
for i in range(min(20, response.shape[0])):
    # Densify only the current sparse row, argsort its weights (ascending),
    # then reverse so the highest-weighted term indices come first.
    tfidf_sorting = np.argsort(response[i].toarray()).flatten()[::-1]
    print(feature_names[tfidf_sorting][:n])

['book' 'soup' 'children' 'chicken' 'read']
['farms' 'packet' 'contains' 'fruit' 'natural']
['china' 'wont' 'dogs' 'imports' 'chicken']
['books' 'ingredients' 'stores' 'book' 'recipes']
['treats' 'liver' 'dog' 'dogs' 'br']
['treats' 'liver' 'dog' 'dogs' 'br']
['tin' 'cookies' 'grams' 'dollars' 'sweet']
['does' 'tummy' 'bother' 'sensitive' 'stomach']
['flies' 'fly' 'trap' 'caught' 'traps']
['victor' 'fly' 'bait' 'traps' 'unreal']
['quot' 'windows' 'printed' 'car' 'reverse']
['beetlejuice' 'movie' 'burton' 'keaton' 'film']
['beetlejuice' 'movie' 'burton' 'keaton' 'film']
['trap' 'mole' 'moles' 'traps' 'spikes']
['bees' 'trap' 'traps' 'yellow' 'ive']
['gopher' 'trap' 'traps' 'set' 'gophers']
['trap' 'mole' 'moles' 'traps' 'set']
['fleas' 'flea' 'trap' 'traps' 'br']
['flies' 'fly' 'trap' 'traps' 'bait']
['flies' 'trap' 'fly' 'traps' 'victor']

	Id	ProductId	UserId	ProfileName	HelpfulnessNumerator	HelpfulnessDenominator	Score	Time	Summary	Text
0	1	B001E4KFG0	A3SGXH7AUHU8GW	delmartian	1	1	5	1303862400	Good Quality Dog Food	I have bought several of the Vitality canned d...
1	2	B00813GRG4	A1D87F6ZCVE5NK	dll pa	0	0	1	1346976000	Not as Advertised	Product arrived labeled as Jumbo Salted Peanut...
2	3	B000LQOCH0	ABXLMWJIXXAIN	Natalia Corres "Natalia Corres"	1	1	4	1219017600	"Delight" says it all	This is a confection that has been around a fe...
3	4	B000UA0QIQ	A395BORC6FGVXV	Karl	3	3	2	1307923200	Cough Medicine	If you are looking for the secret ingredient i...
4	5	B006K2ZZ7K	A1UQRSCLF8GW1T	Michael D. Bigham "M. Wassir"	0	0	5	1350777600	Great taffy	Great taffy at a great price. There was a wid...

	ProductId	Text
0	0006641040	These days, when a person says, "chicken soup"...
1	141278509X	This product by Archer Farms is the best drink...
2	2734888454	My dogs loves this chicken but its a product f...
3	2841233731	This book is easy to read and the ingredients ...
4	7310172001	This product is a very health snack for your p...
5	7310172101	This product is a very health snack for your p...
6	7800648702	This came in a HUGE tin, much bigger than I ex...
7	9376674501	Our little dog has a very sensitive stomach. ...
8	B00002N8SM	I don't know how this product performs with bi...
9	B00002NCJC	Why is this $[...] when the same product is av...
10	B00002Z754	I just received my shipment and could hardly w...
11	B00004CI84	If this is what the afterlife is going to be l...
12	B00004CXX9	If this is what the afterlife is going to be l...
13	B00004RAMS	I have a large yard with several moles. I cal...
14	B00004RAMV	Large, therefore keeps its content for a while...
15	B00004RAMX	Picked up a set of 2 at the local home improve...
16	B00004RAMY	The trap itself seems to be great. The instru...
17	B00004RBDU	Both my cat and dogs have fleas and scratching...
18	B00004RBDW	This fly trap works. Period. Yes, it can stink...
19	B00004RBDZ	I live in Panama and for three or four months ...

[CB Filtering] 4. Profile Learner(1) (0)	2020.04.15
[CB Filtering] 2. Keyword-based Vector Space Model (0)	2020.03.16
[CB Filtering] 1. 개요 (0)	2020.03.12

내 블로그 - 관리자 홈 전환	`Q` `Q`
새 글 쓰기	`W` `W`

글 수정 (권한 있는 경우)	`E` `E`
댓글 영역으로 이동	`C` `C`

데이터 맛집

[CB Filtering] 3. TF-IDF Practice

'추천 시스템 > Content-based Filtering' 카테고리의 다른 글

티스토리툴바

개인정보

단축키

내 블로그

블로그 게시글

모든 영역

이 페이지의 URL 복사	`S` `S`
맨 위로 이동	`T` `T`
티스토리 홈 이동	`H` `H`
단축키 안내	`Shift` + `/` `⇧` + `/`

[CB Filtering] 3. TF-IDF Practice

'추천 시스템 > Content-based Filtering' 카테고리의 다른 글

'추천 시스템/Content-based Filtering' Related Articles

티스토리툴바

개인정보

단축키

내 블로그

블로그 게시글

모든 영역