import numpy as np
import joblib
import pandas as pd
import os,glob
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from pyecharts.charts import Bar,Grid
from pyecharts import options as opts
# Suppress every warning category for the whole process.
warnings.simplefilter("ignore")

# Global matplotlib defaults: figure size, a font that can render the Chinese
# plot titles used in this file, and a plain ASCII minus sign on the axes.
plt.rcParams.update({
    "figure.figsize": (10, 6),
    "font.sans-serif": ["SimHei"],
    "axes.unicode_minus": False,
})
# (champion, runner-up) of each Finals, one pair per season file, matched to
# the season files in sorted filename order.
_FINALS = (
    ('达拉斯 小牛', '迈阿密 热火'),
    ('迈阿密 热火', '奥克拉荷马城 雷霆'),
    ('迈阿密 热火', '圣安东尼奥 马刺'),
    ('圣安东尼奥 马刺', '迈阿密 热火'),
    ('金州 勇士', '克利夫兰 骑士'),
    ('克利夫兰 骑士', '金州 勇士'),
    ('金州 勇士', '克利夫兰 骑士'),
    ('金州 勇士', '克利夫兰 骑士'),
    ('多伦多 猛龙', '金州 勇士'),
    ('洛杉矶 湖人', '迈阿密 热火'),
    ('密尔沃基 雄鹿', '菲尼克斯 太阳'),
)


def _labelled_team_row(season_df, team, label, filename):
    """Return a one-row copy of *team*'s row with column '冠军' set to *label*.

    Raises:
        IndexError: if *team* does not occur in the '球队' column.
    """
    hits = np.flatnonzero((season_df['球队'] == team).to_numpy())
    if hits.size == 0:
        raise IndexError("team %r not found in %r" % (team, filename))
    # Work on a copy and assign the whole column: chained assignment such as
    # df['冠军'].iloc[i] = 1 is not guaranteed to write through to the frame.
    row = season_df.iloc[[hits[0]]].copy()
    row['冠军'] = label
    return row


def get_data(path):
    """Build the training frame from the per-season ``team_data*.csv`` files.

    For every season file the champion's row (label 1) and the runner-up's
    row (label 0) are extracted; the label is stored in column '冠军'. All
    champion rows come first, followed by all runner-up rows, and the result
    is re-indexed from 0. The frame is printed before it is returned.

    Args:
        path: directory that contains the ``team_data*.csv`` files. The
            process working directory is changed to it (side effect kept from
            the original; other functions in this file read and write
            'model.pickle' relative to the working directory).

    Returns:
        pandas.DataFrame with two rows per season file.

    Raises:
        ValueError: if no season file is found.
        IndexError: if there are more season files than known Finals results,
            or a finalist is missing from its season file.
    """
    os.chdir(path)
    # glob returns files in arbitrary order; the Finals table is positional,
    # so the order must be deterministic.
    # NOTE(review): plain lexicographic sort - assumes the file names sort
    # chronologically (e.g. a fixed-width year suffix); confirm.
    filenames = sorted(glob.glob("team_data*.csv"))
    if not filenames:
        raise ValueError("no 'team_data*.csv' files found in %r" % (path,))
    if len(filenames) > len(_FINALS):
        raise IndexError(
            "found %d season files but only %d Finals results are known"
            % (len(filenames), len(_FINALS))
        )

    # Read every file once and reuse it for both the winner and loser pass.
    seasons = [(name, pd.read_csv(name)) for name in filenames]

    rows = []
    for label, side in ((1, 0), (0, 1)):  # champions first, then runners-up
        for (name, season_df), finalists in zip(seasons, _FINALS):
            rows.append(
                _labelled_team_row(season_df, finalists[side], label, name)
            )

    df = pd.concat(rows, axis=0, ignore_index=True)
    print(df)
    return df
def pca_data(data):
    """Project the numeric team statistics with PCA.

    The first column of *data* is dropped (assumed to be the non-numeric team
    name - TODO confirm against the CSV layout), and so is the label column
    '冠军' when present: the label must not be part of the feature matrix,
    otherwise the classifier is trained on its own target.

    Args:
        data: pandas.DataFrame of team statistics.

    Returns:
        numpy.ndarray produced by ``PCA(n_components='mle').fit_transform``.

    NOTE(review): a new PCA is fitted on every call, so the projection used
    for training and the one used at prediction time are fitted separately
    and may not share the same basis or number of components.
    """
    from sklearn.decomposition import PCA

    features = data.drop(columns=[data.columns[0], '冠军'], errors='ignore')
    return PCA(n_components='mle').fit_transform(features)
def fit_data(reduced_X, data):
    """Train the classifiers, print training accuracy and save the forest.

    Logistic regression, a 10-tree random forest, Gaussian naive Bayes and a
    hard-voting ensemble of the three are fitted on the same data. The
    accuracy of each on the training set itself is printed, and the fitted
    random forest is written to 'model.pickle' in the working directory.

    Args:
        reduced_X: feature matrix (array-like), one row per row of *data*.
        data: pandas.DataFrame whose '冠军' column holds the class labels.

    Returns:
        None.
    """
    from sklearn import metrics
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.naive_bayes import GaussianNB

    x_train = np.asarray(reduced_X)
    y_train = np.asarray(data['冠军'])

    # `multi_class` is deprecated in scikit-learn's LogisticRegression; the
    # default already fits a plain binary model for this two-class label.
    LR = LogisticRegression(solver='lbfgs')
    RF = RandomForestClassifier(n_estimators=10)
    GNB = GaussianNB()
    ensemble = VotingClassifier(
        estimators=[('lr', LR), ('rf', RF), ('gnb', GNB)],
        voting='hard',
    )

    # The ensemble is fitted first, then each base model on its own,
    # matching the original fit order.
    for model in (ensemble, LR, RF, GNB):
        model.fit(x_train, y_train)

    named_models = (('LR', LR), ('RF', RF), ('GNB', GNB),
                    ('ensemble', ensemble))
    for name, model in named_models:
        accuracy = metrics.accuracy_score(y_train, model.predict(x_train))
        print("%s - Accuracy (Train): %.4g" % (name, accuracy))

    # Only the random forest is persisted.
    joblib.dump(RF, 'model.pickle')
def pred_data(path):
    """Predict this season's champion candidates and plot the tally.

    Loads '本赛季.csv' from *path*, projects it with ``pca_data``, runs the
    model stored in 'model.pickle' 100 times and counts how often each team
    is predicted as champion (class 1). The tally is printed and drawn as a
    bar chart.

    Args:
        path: directory containing '本赛季.csv'.

    Returns:
        None. If no team is ever predicted as champion a message is printed
        and no chart is drawn.

    NOTE(review): the loaded model is already fitted, so the repeated
    predictions look identical from here and every count would be 0 or 100 -
    confirm whether a probability (e.g. ``predict_proba``) was intended.
    """
    df = pd.read_csv(os.path.join(path, '本赛季.csv'))
    # Placeholder label column so the frame has the same columns as the
    # training frame before it is handed to pca_data.
    df['冠军'] = 0
    x_test = pca_data(df)
    # 'model.pickle' is resolved against the working directory.
    model = joblib.load('model.pickle')

    rounds = 100
    votes = Counter()
    for _ in range(rounds):
        predicted = np.asarray(model.predict(x_test))
        votes.update(df.loc[predicted == 1, '球队'])

    if not votes:
        print("No team was predicted as champion; nothing to plot.")
        return

    # Column 0: team name, column 1: number of champion predictions.
    df_new = pd.DataFrame(list(votes.items()))
    print(df_new)
    sns.barplot(x=df_new[0], y=df_new[1], data=df_new)
    plt.xticks(rotation=90)
    plt.title('夺冠可能性')
    plt.show()
def player_data(path):
    """Rank players by a per-game contribution score and chart the top ten.

    Steps:
      1. Read the first ``player*.csv`` (sorted by name) found in *path*.
      2. Contribution = (得分 + 篮板 + 助攻 + 抢断 + 盖帽 - 失误 - 犯规) / 场次;
         show a bar chart of the ten highest values and print them.
      3. Divide every statistic column by its column total and draw one radar
         chart per top player over six of those statistics.
      4. Render a pyecharts grouped bar chart of the same six statistics
         (written by ``Bar.render()`` into the working directory).
      5. Show a scatter plot of '出场时间' against '失误' for all players.

    Args:
        path: directory containing the player CSV. The process working
            directory is changed to it (side effect kept from the original).

    Returns:
        None.

    Raises:
        IndexError: if no ``player*.csv`` file exists in *path*.
    """
    os.chdir(path)
    filenames = sorted(glob.glob("player*.csv"))
    if not filenames:
        raise IndexError("no 'player*.csv' files found in %r" % (path,))
    df = pd.read_csv(filenames[0])

    contribution = (
        df['得分'] + df['篮板'] + df['助攻'] + df['抢断'] + df['盖帽']
        - df['失误'] - df['犯规']
    ) / df['场次']
    # Column 0: player name, column 1: contribution score.
    df_new = pd.concat([df['球员'], contribution], axis=1, ignore_index=True)
    df_new = df_new.sort_values(by=1, axis=0, ascending=False).head(10)
    sns.barplot(x=df_new[0], y=df_new[1], data=df_new, palette='husl')
    plt.xticks(rotation=90)
    plt.title('个人贡献值前十球员')
    plt.show()
    print(df_new)

    # Express every statistic as the player's share of the column total.
    stats = df.drop('球员', axis=1)
    stats = stats / stats.sum()
    df = pd.concat([df['球员'], stats], axis=1, ignore_index=False)

    radar_fields = ['得分', '防守效率', '进攻效率', '犯规', '出场时间', '失误']
    angles = np.linspace(0, 2 * np.pi, len(radar_fields), endpoint=False)
    # Repeat the first point at the end so the radar polygon closes.
    closed_angles = np.concatenate((angles, [angles[0]]))
    closed_labels = np.array(radar_fields + radar_fields[:1])

    profiles = []
    for row_label in df_new.index:
        player = df.loc[row_label]
        profile = player.loc[radar_fields]
        profiles.append(profile)

        values = np.asarray(profile, dtype=float)
        values = np.concatenate((values, values[:1]))
        plt.polar(closed_angles, values, 'bo-', linewidth=1)
        plt.thetagrids(closed_angles * 180 / np.pi, closed_labels)
        plt.fill(closed_angles, values, facecolor='b', alpha=0.25)
        # Title with the bare player name rather than a Series repr.
        plt.title(str(player['球员']))
        plt.show()

    # One series per top player; works for fewer than ten players as well.
    bar = Bar().add_xaxis(radar_fields)
    for player_name, profile in zip(df_new[0], profiles):
        bar.add_yaxis(player_name, [float(v) for v in profile])
    bar.set_global_opts(datazoom_opts=opts.DataZoomOpts())
    bar.render()

    sns.scatterplot(x="出场时间", y="失误", data=df)
    # Without an explicit show() the scatter plot is never displayed when the
    # file runs as a script.
    plt.show()
def Kmean(data):
    """Cluster the PCA-reduced rows of *data* into three groups.

    The frame is first passed through ``pca_data``; K-means (3 clusters,
    ``random_state=1``) is fitted on the result. The Euclidean silhouette
    score of the clustering is printed.

    Args:
        data: pandas.DataFrame accepted by ``pca_data``.

    Returns:
        The fitted model's ``labels_`` array (one cluster id per row).
    """
    from sklearn import metrics
    from sklearn.cluster import KMeans

    reduced = pca_data(data)
    model = KMeans(n_clusters=3, random_state=1)
    model.fit(reduced)
    cluster_ids = model.labels_
    print(metrics.silhouette_score(reduced, cluster_ids, metric='euclidean'))
    return cluster_ids
def team_data(path):
    """Cluster this season's teams into three tiers and draw a pie chart.

    Reads '本赛季.csv' from *path*, obtains a cluster id per team from
    ``Kmean``, prints the team/tier table, and draws a pie chart of the tier
    sizes with a text line per tier listing its teams.

    Args:
        path: directory containing '本赛季.csv'.

    Returns:
        None.

    NOTE(review): the pie slices are labelled with the raw cluster ids
    (0, 1, 2) while the captions say 球队等级=1..3; kept as in the original.
    """
    df = pd.read_csv(os.path.join(path, '本赛季.csv'))
    labels = pd.DataFrame(Kmean(df)).rename(columns={0: '球队等级'})
    df_new = pd.concat([df['球队'], labels], axis=1, ignore_index=False)
    print(df_new)

    size = df_new.groupby(df_new['球队等级']).球队.count()
    fig = plt.figure(figsize=(10, 4))
    ax1 = fig.add_subplot(121)
    ax1.pie(size, labels=size.index, autopct='%.2f%%', startangle=-110,
            colors=sns.color_palette("rainbow", size.shape[0]))

    # Cluster id -> vertical position of its caption (data coordinates).
    for cluster, y_pos in ((0, 3), (1, 2), (2, 1)):
        members = df_new.loc[df_new['球队等级'] == cluster, '球队'].tolist()
        caption = ['球队等级=%d:' % (cluster + 1)] + members
        # Text expects a string; render the list explicitly, which yields the
        # same text the original produced by passing the list itself.
        ax1.text(x=1, y=y_pos, s=str(caption))

    # Display the figure, as the other plotting functions in this file do.
    plt.show()
if __name__ == "__main__":
    # Script entry point: build the training set, train and persist the
    # model, then run the prediction, player and team analyses in turn.
    # The data directory is hard-coded.
    path = 'D:/天下3/job/nba_data/'
    data = get_data(path)
    fit_data(pca_data(data), data)
    for analysis in (pred_data, player_data, team_data):
        analysis(path)
# 文章评论 ("Article comments" - web page residue, not part of the script)