import graphlab
# Cap the number of pylambda worker processes at 4. Fewer workers use less
# system memory, which keeps hosted notebooks from crashing.
graphlab.set_runtime_config('GRAPHLAB_DEFAULT_NUM_PYLAMBDA_WORKERS', 4)
Loading reviews for a set of baby products.
# Load the product-review SFrame stored under the 'amazon_baby.gl/' path.
products = graphlab.SFrame('amazon_baby.gl/')
Data includes the product name, the review text and the rating of the review.
# Preview the first rows of the freshly loaded data.
products.head()
graphlab.text_analytics.count_words():分词并统计每个词的出现次数
# Turn each review's text into word counts and keep the result as a new column.
review_text = products['review']
products['word_count'] = graphlab.text_analytics.count_words(review_text)
products.head()
# Point GraphLab Canvas at the 'ipynb' target, then visualise the product names.
graphlab.canvas.set_target('ipynb')
product_names = products['name']
product_names.show()
探索一款长颈鹿玩具:Vulli Sophie the Giraffe Teether
# Build a row mask for the Sophie the Giraffe teether and keep only the matching reviews.
is_giraffe = products['name'] == 'Vulli Sophie the Giraffe Teether'
giraffe_reviews = products[is_giraffe]
len(SFrame):查询有多少行
len(giraffe_reviews) # number of reviews for this product
# Show the giraffe teether's ratings using the 'Categorical' view.
giraffe_reviews['rating'].show(view='Categorical')
建立情感分类器
# Show the ratings of all products using the 'Categorical' view.
products['rating'].show(view='Categorical')
We will ignore all reviews with rating = 3, since they tend to have a neutral sentiment. Reviews with a rating of 4 or higher will be considered positive, while those with a rating of 2 or lower will be considered negative. 假定 5 星和 4 星为好评,3 星忽略,2 星和 1 星为差评。
# Drop the 3-star reviews: they are treated as neutral and excluded.
not_neutral = products['rating'] != 3
products = products[not_neutral]
# Label what is left: the sentiment flag is set for ratings of 4 or above.
products['sentiment'] = products['rating'] >= 4
products.head()
# Split into training and test sets with fraction 0.8; the fixed seed makes
# the partition repeatable.
train_data, test_data = products.random_split(0.8, seed=0)
graphlab.logistic_classifier.create():建立逻辑回归分类器
# Fit a logistic classifier that predicts 'sentiment' from the word counts.
# NOTE(review): test_data is also passed as the validation set -- confirm
# that reusing the same split for validation and final evaluation is intended.
sentiment_model = graphlab.logistic_classifier.create(
    train_data,
    target='sentiment',
    features=['word_count'],
    validation_set=test_data,
)
利用 ROC 曲线研究混淆矩阵中假阳性和假阴性
# Evaluate on the test split with the ROC-curve metric, then open the model's
# 'Evaluation' view.
sentiment_model.evaluate(test_data, metric='roc_curve')
sentiment_model.show(view='Evaluation')
# Score every giraffe review with the model's probability output and store
# the scores as a new column.
giraffe_scores = sentiment_model.predict(giraffe_reviews, output_type='probability')
giraffe_reviews['predicted_sentiment'] = giraffe_scores
giraffe_reviews.head()
sort():排序(ascending=True 表示升序,ascending=False 表示降序)
# Order the giraffe reviews from highest to lowest predicted_sentiment.
giraffe_reviews = giraffe_reviews.sort('predicted_sentiment', ascending=False) # descending order
giraffe_reviews.head()
# Review text of the top two rows after the sort (highest predicted_sentiment).
giraffe_reviews[0]['review']
giraffe_reviews[1]['review']
# Review text of the bottom two rows after the sort (lowest predicted_sentiment).
giraffe_reviews[-1]['review']
giraffe_reviews[-2]['review']