项目介绍 这是一个数据分析项目,所使用的代码都是普通代码,没有太多技术含量,借助 ChatGPT 也可以轻松写出。本文只提供并总结分析数据的基本思路。
代码与释义 数据爬虫 从互联网获取数据当然要从爬虫开始,使用 Python 可以很轻松地写出一个可用的爬虫。本文以某网站为例,合法爬取部分数据用于学习交流。
import atexit
import csv
import os
import pprint
import time

import requests

# Search keyword; it is also used as the name of the output CSV file.
keyword = 'eye'

# Make sure the output directory exists before opening the file in it.
os.makedirs('./data', exist_ok=True)
csv_path = './data/' + keyword + '.csv'

# The file is opened in append mode, so an unconditional writeheader() would
# insert a second header row in the middle of the data on every re-run.
# Only write the header when the file is new or still empty.
needs_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0

# The handle stays open for the rest of the script because csv_writer is used
# by later code; atexit guarantees it is flushed and closed on interpreter exit.
f = open(csv_path, mode='a', encoding='utf-8', newline='')
atexit.register(f.close)

csv_writer = csv.DictWriter(f, fieldnames=[
    '标题',
    '品牌',
    '原价',
    '折扣',
    '售价',
    '属性',
    '详情页',
    '评论1',
    '评论2',
])
if needs_header:
    csv_writer.writeheader()
# Request headers shared by every call to the shop API.
headers = {
    'referer': 'https://category.vip.com/',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/97.0.4692.71 Safari/537.36',
}


def get_shop_info(shop_id):
    """Fetch product details for `shop_id` and append one CSV row per product.

    Args:
        shop_id: Value sent as the `productIds` query parameter.

    Raises:
        requests.HTTPError: If the API answers with an error status.
        requests.Timeout: If the API does not answer within the timeout.

    Side effects:
        Writes one row per returned product through the module-level
        `csv_writer` and prints the raw payload and every row.
    """
    # Project-local module that supplies the product comments.
    import comment2

    shop_url = 'https://mapi.vip.com/vips-mobile/rest/shopping/pc/product/module/list/v2'
    data = {
        'app_name': 'shop_pc',
        'app_version': '4.0',
        'warehouse': 'VIP_NH',
        'fdc_area_id': '104104101',
        'client': 'pc',
        'mobile_platform': '1',
        'province_id': '104104',
        'api_key': '70f71280d5d547b2a7bb370a529aeea1',
        'user_id': '',
        'mars_cid': '1641815303238_957cedc5b831e57207fd8334dcd97297',
        'wap_consumer': 'a',
        'productIds': shop_id,
        'scene': 'search',
        'standby_id': 'nature',
        'extParams': '{"stdSizeVids":"","preheatTipsVer":"3","couponVer":"v2","exclusivePrice":"1","iconSpec":"2x","ic2label":1}',
        'context': '',
        '_': '1641816680425',
    }
    # A timeout keeps a stalled connection from hanging the whole crawl, and
    # raise_for_status() surfaces HTTP errors instead of a confusing KeyError.
    response = requests.get(url=shop_url, params=data, headers=headers, timeout=10)
    response.raise_for_status()

    # Decode the body once and reuse it for both the debug print and parsing.
    payload = response.json()
    print('response.json():', payload)

    for product in payload['data']['products']:
        shop_attrs = ','.join(
            attr['name'] + ':' + attr['value'] for attr in product['attrs']
        )
        # No spaces inside the URL: the original template had stray blanks
        # around the hyphen and before ".html", which produced a broken link.
        href = f'https://www.vipglobal.hk/detail-{product["brandId"]}-{product["productId"]}.html'
        # NOTE(review): assumes get_comments returns at least two entries;
        # confirm against comment2.
        c = comment2.get_comments(product["spuId"], product["brandId"])
        dit = {
            '标题': product['title'],
            '品牌': product['brandShowName'],
            '原价': product['price']['marketPrice'],
            '折扣': product['price']['mixPriceLabel'],
            '售价': product['price']['salePrice'],
            '属性': shop_attrs,
            '详情页': href,
            '评论1': c[0],
            '评论2': c[1],
        }
        csv_writer.writerow(dit)
        print(dit)
# Walk through the search results in steps of 120 (the API batch size) and
# fetch the details of every product id that comes back.
for page in range(0, 601, 120):
    # Pause between pages to keep the request rate low.
    time.sleep(1)
    url = 'https://mapi.vip.com/vips-mobile/rest/shopping/pc/search/product/rank'
    data = {
        'app_name': 'shop_pc',
        'app_version': '4.0',
        'warehouse': 'VIP_NH',
        'fdc_area_id': '104104101',
        'client': 'pc',
        'mobile_platform': '1',
        'province_id': '104104',
        'api_key': '70f71280d5d547b2a7bb370a529aeea1',
        'user_id': '',
        'mars_cid': '1641815303238_957cedc5b831e57207fd8334dcd97297',
        'wap_consumer': 'a',
        'standby_id': 'nature',
        'keyword': keyword,
        'lv3CatIds': '',
        'lv2CatIds': '',
        'lv1CatIds': '',
        'brandStoreSns': '',
        'props': '',
        'priceMin': '',
        'priceMax': '',
        'vipService': '',
        'sort': '0',
        'pageOffset': page,
        'channelId': '1',
        'gPlatform': 'PC',
        'batchSize': '120',
        '_': '1641816680423',
    }
    response = requests.get(url=url, params=data, headers=headers, timeout=10)
    response.raise_for_status()
    pid_list = [item['pid'] for item in response.json()['data']['products']]

    # Query details in batches of at most 50 ids. Iterating over the actual
    # list length yields the same 50/50/20 split for a full page of 120 ids,
    # but never issues a request with an empty id string on a short page.
    for start in range(0, len(pid_list), 50):
        get_shop_info(','.join(pid_list[start:start + 50]))
数据进一步处理与可视化 前一步得到的数据仅仅达到了“能看”的程度,离可以正式利用还有很大距离,因而要对数据进行进一步的清洗、转换等操作。另外这里也提供数据可视化与初步分析的操作代码,可以学习一下思路。
基础操作
1 2 3 4 import pandas as pd%matplotlib inline import matplotlib.pyplot as pltplt.style.use('seaborn-darkgrid' )
读取文件并初步可视化,主要观察有哪些列、数据大体是什么样子,形成一个直观的感受
# Load the scraped rows into a DataFrame.
eye = pd.read_csv('./data/eye.csv')
# Bare expression: a notebook renders the table when this ends the cell.
eye
对各数据列进行一般性的统计,包括最大值、最小值、各分位点等
数据的增删改查 原则:在数据处理中,要赋予属性明确含义,统一属性值编码,去除重复和可忽略字段,合理选择关联属性,以提高数据质量和分析效果。
# Bare expression: a notebook renders the table when this ends the cell.
eye

# Preview the columns with gaps filled from the next valid row.
# `.bfill()` replaces the deprecated `fillna(method='backfill')`.
# NOTE(review): the results are not assigned back, so `eye` itself is left
# unchanged - assign them if the fill is meant to be kept.
eye['原价'].bfill()
eye['折扣'].bfill()


def str_process(discount):
    """Convert a discount label to a float by dropping its trailing non-digits.

    Args:
        discount: A number, or text whose trailing non-digit characters
            should be ignored.

    Returns:
        The numeric value as a float. Numbers are returned as floats
        unchanged in value; text without any digit yields NaN.

    Raises:
        ValueError: If the text left after trimming is not a valid number.
    """
    if isinstance(discount, (int, float)):
        return float(discount)

    text = str(discount)
    # Walk back from the end until the last character is a digit.
    end = len(text)
    while end > 0 and not text[end - 1].isdigit():
        end -= 1
    if end == 0:
        # Nothing numeric in the label; the original looped off the end of
        # the string here and raised IndexError.
        return float('nan')
    return float(text[:end])
# Section: visualization analysis (可视化分析)
# Load the cleaned data set.
eye2 = pd.read_csv('./data/eye2.csv', index_col=None)
# Bare expressions: a notebook only renders the last one in a cell.
eye2
# Number of rows per brand.
eye2['品牌'].value_counts()
import matplotlib.pyplot as plt

# Use a font with CJK glyphs so the Chinese title renders, and keep the minus
# sign drawable with that font.
plt.rcParams.update({
    'font.sans-serif': ['Songti SC'],
    'axes.unicode_minus': False,
})

# Histogram of the original price column, saved as a PNG.
eye2['原价'].hist()
plt.title('原价')
plt.savefig('./1_0.png', dpi=200)
# Histogram of the discount column, saved as a PNG.
eye2['折扣'].hist()
plt.title('折扣')
# The original path was './1_1png' (missing dot), which did not match the
# naming of the other saved figures.
plt.savefig('./1_1.png', dpi=200)
# Section: correlation plots (相关性图)
from pandas.plotting import scatter_matrix

attributes = ['原价', '售价', '折扣']
# Take an explicit copy: columns are added to eye3 later on, and assigning
# into a bare column selection triggers pandas' SettingWithCopyWarning.
eye3 = eye2[attributes].copy()

# Pairwise scatter plots of the three numeric columns.
scatter_matrix(eye3, figsize=(12, 8))
plt.savefig('./2.png', dpi=200)
# Section: heatmap
import numpy as np
import seaborn as sns

# Pairwise correlation between the columns of eye3.
corr = eye3.corr()

# One tick per row/column of the correlation matrix, labelled by column name.
tick_positions = np.arange(len(corr))
plt.xticks(
    tick_positions,
    labels=attributes,
    rotation=45,
    rotation_mode="anchor",
    ha="right",
)
plt.yticks(tick_positions, labels=attributes)

sns.heatmap(corr)
plt.savefig('./3.png', dpi=200)
# Section: text processing (文本处理)
def _attr_value(text, key, width):
    """Return the `width` characters that follow `key` plus one separator.

    Returns None when `text` is not a string (for example a NaN cell) or when
    `key` does not occur in it.
    """
    if not isinstance(text, str):
        return None
    pos = text.find(key)
    if pos == -1:
        return None
    # Skip the key itself and the single separator character after it.
    start = pos + len(key) + 1
    return text[start:start + width]


def func(str):
    """Return the two characters after '类型' and its separator, or None."""
    return _attr_value(str, '类型', 2)


def component(str):
    """Return the two characters after '使用方式' and its separator, or None."""
    return _attr_value(str, '使用方式', 2)


def skin(str):
    """Return the single character after '是否套装' and its separator, or None."""
    return _attr_value(str, '是否套装', 1)


def money(num):
    """Map a price to the index of its 50-wide bucket."""
    return int(num // 50)


def _plot_value_counts(column, out_path, rotation=None):
    """Print the value counts of eye3[column] and save them as a bar chart."""
    counts = eye3[column].value_counts()
    print(f"eye3['{column}'].value_counts():", counts)
    positions = list(range(len(counts)))
    plt.bar(x=positions, height=counts.values)
    tick_options = {} if rotation is None else {'rotation': rotation}
    plt.xticks(ticks=positions, labels=counts.index, **tick_options)
    plt.savefig(out_path, dpi=200)
    plt.show()


# Replace the sale price with its bucket index and derive three categorical
# columns from the free-text attribute string.
eye3['售价'] = eye2['售价'].apply(money)
eye3['类型'] = eye2['属性'].apply(func)
eye3['使用方式'] = eye2['属性'].apply(component)
eye3['是否套装'] = eye2['属性'].apply(skin)

_plot_value_counts('类型', './4_1.png')
_plot_value_counts('使用方式', './4_2.png', rotation='vertical')
_plot_value_counts('是否套装', './4_3.png')
# Section: estimating the price bucket (kmeans算法对价格进行估计)
# NOTE(review): the heading says "kmeans", but the model used below is
# KNeighborsClassifier, i.e. k-nearest neighbours, not k-means.
import matplotlib.pyplot as plt
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

# Drop rows with a missing numeric value. (Alternative considered by the
# author: back-fill instead, e.g. eye3['原价'] = eye3['原价'].bfill().)
eye3 = eye3.dropna(subset=['原价', '售价', '折扣'])

# Give rows without a parsed attribute a default category.
eye3['类型'] = eye3['类型'].fillna('非眉笔')
eye3['使用方式'] = eye3['使用方式'].fillna('刀削')
eye3['是否套装'] = eye3['是否套装'].fillna('否')

eye3.to_csv('./data/eye3.csv', index=False)

# One-hot encode the categorical columns.
eye3 = pd.get_dummies(eye3)
eye3.head(10)

y = eye3['售价'].values
attributes_2 = ['原价', '折扣',
                '类型_US', '类型_单刀', '类型_眉毛', '类型_眉笔', '类型_眉粉',
                '类型_眉膏', '类型_眼线', '类型_非眉笔',
                '使用方式_刀削', '使用方式_自动',
                '是否套装_否', '是否套装_是']
# reindex() instead of plain column selection: a category that is absent from
# this data set becomes an all-zero column rather than a KeyError. The cast
# gives the model one numeric matrix even when the dummies are booleans.
X = eye3.reindex(columns=attributes_2, fill_value=0).astype(float).values
print(pd.Series(y).max())
print(pd.Series(y).min())

# A fixed seed makes the split, and therefore the scores, reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

knn_clf = KNeighborsClassifier(n_neighbors=6)
knn_clf.fit(X_train, y_train)
y_predict = knn_clf.predict(X_test)

# The original computed this ratio and discarded it; print it instead.
print('accuracy:', (y_predict == y_test).mean())
print('y_predict:', y_predict)
print('y_test:', y_test)
计算指标 1 2 3 4 5 import numpy as npfrom sklearn.metrics import roc_curve, auc, accuracy_score, precision_score, recall_score, f1_score, confusion_matrixprint (accuracy_score(y_pred=y_predict,y_true=y_test))print (confusion_matrix(y_pred=y_predict,y_true=y_test)