How do I load a local dataset into this Python product-sales forecasting system?

CSDN Q&A, 2022-01-07 08:05:06, Views: 634

python, merchandise
#%%
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load
import numpy as np   # linear algebra
import pandas as pd  # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output
# when you create a version using "Save & Run All"
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

#%%
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

#%%
train = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/sales_train.csv')
shops = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/shops.csv')
items = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/items.csv')
cat = pd.read_csv('/kaggle/input/competitive-data-science-predict-future-sales/item_categories.csv')
train = train.sample(frac=0.1, random_state=42)

#%%
# merge duplicate shop ids
shop_id_map = {11: 10, 0: 57, 1: 58, 40: 39}
train.loc[train['shop_id'].isin(shop_id_map), 'shop_id'] = train.loc[train['shop_id'].isin(shop_id_map), 'shop_id'].map(shop_id_map)
train.loc[train['shop_id'].isin(shop_id_map), 'shop_id']

#%%
shops['shop_city'] = shops['shop_name'].map(lambda x: x.split(' ')[0].strip('!'))
shop_types = ['ТЦ', 'ТРК', 'ТРЦ', 'ТК', 'МТРЦ']
shops['shop_type'] = shops['shop_name'].map(lambda x: x.split(' ')[1] if x.split(' ')[1] in shop_types else 'Others')
shops.loc[shops['shop_id'].isin([12, 56]), ['shop_city', 'shop_type']] = 'Online'  # shops 12 and 56 are online stores
shops.head(13)

#%%
shop_city_map = dict([(v, k) for k, v in enumerate(shops['shop_city'].unique())])
shop_type_map = dict([(v, k) for k, v in enumerate(shops['shop_type'].unique())])
shops['shop_city_code'] = shops['shop_city'].map(shop_city_map)
shops['shop_type_code'] = shops['shop_type'].map(shop_type_map)
shops.head(7)

#%%
items['item_name'] = items['item_name'].map(lambda x: ''.join(x.split(' ')))  # remove spaces
duplicated_item_name = items[items['item_name'].duplicated()]
duplicated_item_name_rec = items[items['item_name'].isin(duplicated_item_name['item_name'])]

#%%
# map each duplicated item id onto its counterpart
old_id = duplicated_item_name_rec['item_id'].values[::2]
new_id = duplicated_item_name_rec['item_id'].values[1::2]
old_new_map = dict(zip(old_id, new_id))
train.loc[train['item_id'].isin(old_id), 'item_id'] = train.loc[train['item_id'].isin(old_id), 'item_id'].map(old_new_map)
train[train['item_id'].isin(old_id)]

#%%
cat['item_type'] = cat['item_category_name'].map(lambda x: 'Игры' if x.find('Игры ') > 0 else x.split(' -')[0].strip('\"'))
cat.iloc[[32, -3, -2], -1] = ['Карты оплаты', 'Чистые носители', 'Чистые носители']
item_type_map = dict([(v, k) for k, v in enumerate(cat['item_type'].unique())])
cat['item_type_code'] = cat['item_type'].map(item_type_map)
cat.head()

#%%
cat['sub_type'] = cat['item_category_name'].map(lambda x: x.split('-', 1)[-1])
sub_type_map = dict([(v, k) for k, v in enumerate(cat['sub_type'].unique())])
cat['sub_type_code'] = cat['sub_type'].map(sub_type_map)

#%%
items = items.merge(cat[['item_category_id', 'item_type_code', 'sub_type_code']], on='item_category_id', how='left')
items.head()

#%%
import gc
del cat
gc.collect()

#%%
sns.jointplot(x='item_cnt_day', y='item_price', data=train, kind='scatter')

#%%
train_filtered = train[(train['item_cnt_day'] < 800) & (train['item_price'] < 70000)].copy()

#%%
outer = train[(train['item_cnt_day'] > 400) | (train['item_price'] > 40000)]

#%%
outer_set = train_filtered[train_filtered['item_id'].isin(outer['item_id'].unique())].groupby('item_id')
fig, ax = plt.subplots(1, 1, figsize=(10, 10))
colors = sns.color_palette() + sns.color_palette('bright')  # use a wider palette; the default has only about 10 colours and repeats, which makes groups hard to tell apart
i = 1
for name, group in outer_set:
    ax.plot(group['item_cnt_day'], group['item_price'], marker='o', linestyle='', ms=12, label=name, c=colors[i])
    i += 1
ax.legend()
plt.show()

#%%
filtered = train[(train['item_cnt_day'] < 400) & (train['item_price'] < 45000)].copy()
filtered.head()

#%%
filtered.drop(index=filtered[filtered['item_id'].isin([7238, 14173])].index, inplace=True)

#%%
del train, train_filtered
gc.collect()

#%%
filtered.loc[filtered['item_price'] <= 0, 'item_price'] = 1249.0  # filled with the mean price of this item in the same shop during the same month
filtered[filtered['item_price'] <= 0]

#%%
filtered['turnover_day'] = filtered['item_price'] * filtered['item_cnt_day']

#%%
item_sales_monthly = filtered.pivot_table(columns='item_id', index='date_block_num', values='item_cnt_day', fill_value=0, aggfunc=sum)
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
item_sales_monthly.sum(1).plot(ax=axes[0], title='Total sales of each month', xticks=[i for i in range(0, 34, 2)])  # total sales per month
item_sales_monthly.sum(0).plot(ax=axes[1], title='Total sales of each item')  # total sales per item
plt.subplots_adjust(wspace=0.2)

#%%
top_sales = item_sales_monthly.sum().sort_values(ascending=False)

#%%
item_turnover_monthly = filtered.pivot_table(index='date_block_num', columns='item_id', values='turnover_day', fill_value=0, aggfunc=sum)
item_sales_monthly = item_sales_monthly.drop(columns=top_sales[top_sales <= 0].index, axis=1)  # drop items whose total sales are zero or negative
item_turnover_monthly = item_turnover_monthly.drop(columns=top_sales[top_sales <= 0].index, axis=1)
total_turnover = item_turnover_monthly.sum().sum()

#%%
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
item_turnover_monthly.sum(1).plot(ax=axes[0], title='Total turnovers of each month', xticks=[i for i in range(0, 34, 2)])  # total turnover per month
item_turnover_monthly.sum(0).plot(ax=axes[1], title='Total turnovers of each item')  # total turnover per item
plt.subplots_adjust(wspace=0.2)

#%%
top_turnover = item_turnover_monthly.sum().sort_values(ascending=False)

#%%
turnover_monthly = item_turnover_monthly.sum(1)
sales_monthly = item_sales_monthly.sum(1)
fig, axe1 = plt.subplots(1, 1, figsize=(16, 6))
axe2 = axe1.twinx()
axe1.plot(turnover_monthly.index, turnover_monthly.values, c='r')
axe2.plot(sales_monthly.index, sales_monthly.values, c='b')
axe2.grid(c='c', alpha=0.3)
axe1.legend(['Monthly Turnover'], fontsize=13, bbox_to_anchor=(0.95, 1))
axe2.legend(['Monthly Sales'], fontsize=13, bbox_to_anchor=(0.93, 0.9))
axe1.set_ylabel('Monthly Turnover', c='r')
axe2.set_ylabel('Monthly Sales', c='b')
plt.show()

#%%
sales_growth = item_sales_monthly.loc[23].sum() - item_sales_monthly.loc[11].sum()
sales_growth_rate = sales_growth / item_sales_monthly.loc[11].sum() * 100
turnover_growth = item_turnover_monthly.loc[23].sum() - item_turnover_monthly.loc[11].sum()
turnover_growth_rate = turnover_growth / item_turnover_monthly.loc[11].sum() * 100
print(' Year-on-year sales growth: %.2f, growth rate: %.2f%%;\n' % (sales_growth, sales_growth_rate),
      'Year-on-year turnover growth: %.2f, growth rate: %.2f%%.' % (turnover_growth, turnover_growth_rate))

#%%
filtered.groupby('shop_id')['item_cnt_day'].sum().sort_values().plot(kind='bar', figsize=(12, 6))

#%%
filtered.groupby('shop_id')['turnover_day'].sum().sort_values().plot(kind='bar', figsize=(12, 6))

#%%
filtered = filtered.merge(items.iloc[:, 1:], on='item_id', how='left')
filtered.head()

#%%
filtered.groupby('item_category_id')['turnover_day'].sum().sort_values().plot(kind='bar', figsize=(16, 6), rot=0)

#%%
filtered = filtered.merge(shops[['shop_id', 'shop_city_code', 'shop_type_code']], on='shop_id', how='left')
filtered.head()

#%%
filtered.groupby('shop_city_code')['item_cnt_day'].sum().plot(kind='bar', figsize=(12, 6))

#%%
filtered.groupby('shop_type_code')['item_cnt_day'].sum().plot(kind='bar', figsize=(12, 6))

#%%
shop_sales_monthly = filtered.pivot_table(index='date_block_num', columns='shop_id', values='item_cnt_day', fill_value=0, aggfunc=sum)
shop_open_month_cnt = (shop_sales_monthly.iloc[-6:] > 0).sum()  # months with sales recorded
shop_open_month_cnt.head()  # how many of the last six months each shop had sales in

#%%
item_selling_month_cnt = (item_sales_monthly.iloc[-6:] > 0).sum()
open_shop = shop_sales_monthly[shop_open_month_cnt[shop_open_month_cnt == 6].index]
item_zero = item_sales_monthly[item_selling_month_cnt[item_selling_month_cnt == 0].index]
selling_item = item_sales_monthly[item_selling_month_cnt[item_selling_month_cnt > 0].index]
cl_set = filtered[filtered['shop_id'].isin(open_shop.columns) & filtered['item_id'].isin(selling_item.columns)]

#%%
# build a (month, shop, item) grid from the cleaned subset
from itertools import product
import time
ts = time.time()
martix = []
for i in range(34):
    record = cl_set[cl_set['date_block_num'] == i]
    group = product([i], record.shop_id.unique(), record.item_id.unique())
    martix.append(np.array(list(group)))
cols = ['date_block_num', 'shop_id', 'item_id']
martix = pd.DataFrame(np.vstack(martix), columns=cols)
martix

#%%
# rebuild the grid from all filtered records; this overwrites the cl_set-based grid above
from itertools import product
import time
ts = time.time()
martix = []
for i in range(34):
    record = filtered[filtered['date_block_num'] == i]
    group = product([i], record.shop_id.unique(), record.item_id.unique())
    martix.append(np.array(list(group)))
cols = ['date_block_num', 'shop_id', 'item_id']
martix = pd.DataFrame(np.vstack(martix), columns=cols)
martix

#%%
del cl_set
del item_sales_monthly
del item_turnover_monthly
gc.collect()

#%%
group = filtered.groupby(['date_block_num', 'shop_id', 'item_id']).agg({'item_cnt_day': np.sum})
group.columns = ['item_cnt_month']
group.reset_index(inplace=True)
del filtered

#%%
martix = pd.merge(martix, group, on=['date_block_num', 'shop_id', 'item_id'], how='left')
martix.head()

#%%
martix = martix.fillna(0)

#%%
martix = martix.merge(shops[['shop_id', 'shop_type_code', 'shop_city_code']], on='shop_id', how='left')
martix = martix.merge(items.drop(columns='item_name'), on='item_id', how='left')
martix

#%%
martix['year'] = martix['date_block_num'].map(lambda x: x // 12 + 2013)
martix['month'] = martix['date_block_num'].map(lambda x: x % 12)
martix.head()

#%%
# monthly mean sales aggregated over several groupings, merged back as features
group = martix.groupby(['date_block_num', 'item_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['item_cnt_month_avg']
group.reset_index(inplace=True)
martix = martix.merge(group, on=['date_block_num', 'item_id'], how='left')

group = martix.groupby(['date_block_num', 'shop_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['shop_cnt_month_avg']
group.reset_index(inplace=True)
martix = martix.merge(group, on=['date_block_num', 'shop_id'], how='left')

group = martix.groupby(['date_block_num', 'item_category_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['cat_cnt_month_avg']
group.reset_index(inplace=True)
martix = martix.merge(group, on=['date_block_num', 'item_category_id'], how='left')

group = martix.groupby(['date_block_num', 'shop_id', 'item_category_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['shop_cat_cnt_month_avg']
group.reset_index(inplace=True)
martix = martix.merge(group, on=['date_block_num', 'shop_id', 'item_category_id'], how='left')

group = martix.groupby(['date_block_num', 'item_type_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['itemtype_cnt_month_avg']
group.reset_index(inplace=True)
martix = martix.merge(group, on=['date_block_num', 'item_type_code'], how='left')

group = martix.groupby(['date_block_num', 'sub_type_code']).agg({'item_cnt_month': 'mean'})
group.columns = ['subtype_cnt_month_avg']
group.reset_index(inplace=True)
martix = martix.merge(group, on=['date_block_num', 'sub_type_code'], how='left')

group = martix.groupby(['date_block_num', 'shop_city_code', 'item_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['city_item_cnt_month_avg']
group.reset_index(inplace=True)
martix = martix.merge(group, on=['date_block_num', 'shop_city_code', 'item_id'], how='left')

group = martix.groupby(['date_block_num', 'shop_type_code', 'item_id']).agg({'item_cnt_month': 'mean'})
group.columns = ['shoptype_item_cnt_month_avg']
group.reset_index(inplace=True)
martix = martix.merge(group, on=['date_block_num', 'shop_type_code', 'item_id'], how='left')
martix.head()

#%%
del group
gc.collect()

#%%
def lag_feature(df, lags, col):
    # add lagged copies of col for the given month offsets
    tmp = df[['date_block_num', 'shop_id', 'item_id', col]]
    for i in lags:
        shifted = tmp.copy()
        shifted.columns = ['date_block_num', 'shop_id', 'item_id', col + '_lag_' + str(i)]
        shifted['date_block_num'] += i
        df = pd.merge(df, shifted, on=['date_block_num', 'shop_id', 'item_id'], how='left')
    return df

martix = lag_feature(martix, [1, 2, 3, 6, 12], 'item_cnt_month')
martix = lag_feature(martix, [1, 2, 3, 6, 12], 'item_cnt_month_avg')
martix = lag_feature(martix, [1, 2, 3, 6, 12], 'shop_cnt_month_avg')
martix.drop(columns=['item_cnt_month_avg', 'shop_cnt_month_avg'], inplace=True)  # keep only the lagged (historical) versions of the features
gc.collect()
martix = lag_feature(martix, [1, 2, 3, 6, 12], 'cat_cnt_month_avg')
martix = lag_feature(martix, [1, 2, 3, 6, 12], 'shop_cat_cnt_month_avg')
martix.drop(columns=['cat_cnt_month_avg', 'shop_cat_cnt_month_avg'], inplace=True)
martix = lag_feature(martix, [1, 2, 3, 6, 12], 'itemtype_cnt_month_avg')
martix = lag_feature(martix, [1, 2, 3, 6, 12], 'subtype_cnt_month_avg')
martix.drop(columns=['itemtype_cnt_month_avg', 'subtype_cnt_month_avg'], inplace=True)
martix = lag_feature(martix, [1, 2, 3, 6, 12], 'city_item_cnt_month_avg')
martix = lag_feature(martix, [1, 2, 3, 6, 12], 'shoptype_item_cnt_month_avg')
martix.drop(columns=['city_item_cnt_month_avg', 'shoptype_item_cnt_month_avg'], inplace=True)

#%%
martix[martix.columns[:20]].isna().any()

#%%
train_set = martix[martix['date_block_num'] > 11].fillna(0)
del martix
gc.collect()

#%%
# downcast dtypes to save memory
for col in train_set.columns:
    if col.find('code') >= 0:
        train_set[col] = train_set[col].astype(np.int8)
    elif train_set[col].dtype == 'float64':
        train_set[col] = train_set[col].astype(np.float32)
    elif train_set[col].dtype == 'int64':
        train_set[col] = train_set[col].astype(np.int16)
train_set['item_type_code'] = train_set['item_type_code'].astype('category')
train_set['sub_type_code'] = train_set['sub_type_code'].astype('category')
train_set.info()

#%%
import lightgbm as lgb
X_train = train_set[train_set['date_block_num'] < 32].drop(columns=['item_cnt_month'])  # training-set features
Y_train = train_set[train_set['date_block_num'] < 32]['item_cnt_month']  # training-set labels
X_validate = train_set[train_set['date_block_num'] == 32].drop(columns=['item_cnt_month'])  # validation set
Y_validate = train_set[train_set['date_block_num'] == 32]['item_cnt_month']
X_test = train_set[train_set['date_block_num'] == 33].drop(columns=['item_cnt_month'])  # test set

#%%
Y_true = train_set[train_set['date_block_num'] == 33]['item_cnt_month']

#%%
X_test[0:50]

#%%
del train_set
gc.collect()

#%%
train_data = lgb.Dataset(data=X_train, label=Y_train)
validate_data = lgb.Dataset(data=X_validate, label=Y_validate)

#%%
import time
ts = time.time()
params = {"objective": "regression", "metric": "rmse", 'n_estimators': 10000, 'early_stopping_rounds': 50,
          "num_leaves": 200, "learning_rate": 0.01, "bagging_fraction": 0.9, "feature_fraction": 0.3, "bagging_seed": 0}
print('Start....', ts)
lgb_model = lgb.train(params, train_data, valid_sets=[train_data, validate_data], verbose_eval=1000)
print('End...', time.time() - ts)

#%%
lgb.plot_importance(lgb_model, max_num_features=40, figsize=(12, 8))
plt.title("Feature Importances")
plt.show()

#%%
X_test.shape

#%%
Y_true.shape

#%%
Y_test = lgb_model.predict(X_test).clip(0, 20)

#%%
error = Y_test - Y_true
rmse = (error ** 2).mean() ** 0.5
rmse

#%%
X_test.head(50)

#%%
Y_test[0:50]

#%%
Y_true[0:50]

#%%

This is the source code.

Also, what is causing the error in this part?
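Regarding loading a local dataset: the only Kaggle-specific pieces are the read_csv paths under /kaggle/input/ (plus the os.walk listing cell, which is only a convenience). A minimal sketch of the loading cell adapted to local files is shown below; it assumes the four competition CSVs have already been downloaded, and the DATA_DIR folder name is purely illustrative, so adjust it to wherever the files actually live.

# Minimal sketch (not part of the original notebook): read the same four competition
# CSVs from a local folder instead of the read-only /kaggle/input directory.
# DATA_DIR is an assumed, illustrative path; point it at the folder holding your files.
import os
import pandas as pd

DATA_DIR = './data'  # assumed local folder containing the downloaded competition files

train = pd.read_csv(os.path.join(DATA_DIR, 'sales_train.csv'))
shops = pd.read_csv(os.path.join(DATA_DIR, 'shops.csv'))
items = pd.read_csv(os.path.join(DATA_DIR, 'items.csv'))
cat = pd.read_csv(os.path.join(DATA_DIR, 'item_categories.csv'))
train = train.sample(frac=0.1, random_state=42)  # same subsampling as the original cell

The os.walk('/kaggle/input') listing cell and the comments about /kaggle/working only apply inside Kaggle and can be skipped when running locally; the rest of the notebook operates on the DataFrames themselves, so it should run unchanged once these four files are read in.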
