HAFTA-1 - Data Manipulation with Python - PDF
Document Details
Uploaded by Deleted User
Tags
Summary
These lecture notes cover basic data manipulation techniques using Python's Pandas library. The content focuses on creating and manipulating DataFrames for a variety of tasks, and the excerpt also demonstrates fundamental Python code for working with data structures such as strings, lists, and numbers.
Full Transcript
HAFTA-1 - data=\"Merhaba Dünya\" - data\[0\] \#\'M\' - data\[-1\] \#\'a\' - len(data) \#13 - a,b,c =10,-20,100 - print(a,b,c) \#10 -20 100 - notun =100 - if notun\>=90: print(\"AA\") elif notun\>=80: print(\"BB\") else: print(\"CC\") \#AA - for i in range (1,10): pri...
HAFTA-1 - data=\"Merhaba Dünya\" - data\[0\] \#\'M\' - data\[-1\] \#\'a\' - len(data) \#13 - a,b,c =10,-20,100 - print(a,b,c) \#10 -20 100 - notun =100 - if notun\>=90: print(\"AA\") elif notun\>=80: print(\"BB\") else: print(\"CC\") \#AA - for i in range (1,10): print(i) \#1 2 3 4 5 6 7 8 9 - i=0 while i\ - data.tail() \#sondan beş eleman ![](media/image4.png) - data.columns \#veri setimizin sutünu yanı featureni getirir. \#Index(\[\'ISIM\', \'YAS\', \'MAAS\'\], dtype=\'object\') - data.info() \#!!!!veri setin sütunlarını tipleri var non null boş değil toplam 3 elemanlı.Object veri tipinde varsa çalışmaz yapay zekada. \#\ - data.describe().T \#transpozunu alma ![](media/image6.png) - data.shape\#eleman sayısı \#(6, 3) - HAFTA-2 - import pandas as pd - sozluk= {\"ISIM\":\[\'ELİF\',\'ENES\',\'BATU\',\'METE\',\'GÖKTAY\',\'EREN\'\], \"YAS\":\[22,23,24,25,26,27\], \"MAAS\":\[10000,20000,30000,40000,50000,60000\]} sozluk \#{\'ISIM\': \[\'ELİF\', \'ENES\', \'BATU\', \'METE\', \'GÖKTAY\', \'EREN\'\], \'YAS\': \[22, 23, 24, 25, 26, 27\], \'MAAS\': \[10000, 20000, 30000, 40000, 50000, 60000\]} - data=pd.DataFrame(sozluk) data - data.dtypes \#ISIM object YAS int64 MAAS int64 dtype: object - data.shape \#(6, 3) - data.YAS \#0 22 1 23 2 24 3 25 4 26 5 27 Name: YAS, dtype: int64 - data\[\"YAS\"\] \#0 22 1 23 2 24 3 25 4 26 5 27 Name: YAS, dtype: int64 - data\[\"AGNO\"\]=\[3.45,3,2.88,3.5,2.44,3\] data ![](media/image8.png) - data.loc\[:,\"YAS\"\] \#0 22 1 23 2 24 3 25 4 26 5 27 Name: YAS, dtype: int64 - data.loc\[:3,\"YAS\"\] \#0 22 1 23 2 24 3 25 Name: YAS, dtype: int64 - data.loc\[:3,\"YAS\":\"MAAS\"\] - data.loc\[::-1,:\] ![](media/image10.png) - filtre1=data.YAS\>23 filtre1 \#0 False 1 False 2 True 3 True 4 True 5 True Name: YAS, dtype: bool - yeniData=data\[filtre1\] - filtre2=data.MAAS\>25000 filtre2 \#0 False 1 False 2 True 3 True 4 True 5 True Name: MAAS, dtype: bool - yeniData2=data\[filtre1&filtre2\] yeniData2 ![](media/image12.png) - data\[\"Durum\"\]=\[\"GEÇTİ\" if 
i\>=3 else \"KALDI\"for i in data.AGNO\] data - \#SORU2 = YENİ sütun oluştur adı maaş durum maaşın ortalamasını hesapla ortalamadan çoksa çok değilse az ortalama=data.MAAS.mean() print(ortalama) \#35000.0 - data\[\"MAAŞ DURUM\"\]=\[\"ÇOK\" if i\>=ortalama else \"AZ\" for i in data.MAAS\] ![](media/image14.png) - data.drop(\[\"MAAŞ DURUM\"\],axis=1) data - data\[\"MAAŞ DURUM\"\]=\[\"ÇOK\" if i\>=ortalama else \"AZ\" for i in data.MAAS\] data ![](media/image16.png) - data.drop(\[\"MAAŞ DURUM\"\],axis=1,inplace=True) data - import numpy as np - data1=pd.DataFrame(np.random.randn(6,6), index=\[\"ELİF\",\"ENES\",\"BATU\",\"METE\",\"GÖKTAY\",\"EREN\"\], columns=\[\"Glukoz\",\"CRP\",\"ALT\",\"P\",\"Na\",\"D vit\"\])![](media/image18.png) - data1.loc\[\"ELİF\":\"BATU\",\[\"CRP\",\"P\"\]\]=np.nan data1 - data1.isnull().sum() \#Glukoz 0 CRP 3 ALT 0 P 3 Na 0 D vit 0 dtype: int64 - data1.isnull().sum().sum() \#6 - data1.info() \#\ Index: 6 entries, ELİF to EREN Data columns (total 6 columns): \# Column Non-Null Count Dtype \-\-- \-\-\-\-\-- \-\-\-\-\-\-\-\-\-\-\-\-\-- \-\-\-\-- 0 Glukoz 6 non-null float64 1 CRP 3 non-null float64 2 ALT 6 non-null float64 3 P 3 non-null float64 4 Na 6 non-null float64 5 D vit 6 non-null float64 dtypes: float64(6) memory usage: 508.0+ bytes - data1.fillna(value=0) ![](media/image20.png) - data1.fillna(value=1) - data1.fillna(method=\"ffill\")\ ![](media/image22.png) - data1.fillna(method=\"bfill\") \#tüm veriyi gez eksik olanları bul bunun yerine değerlerin ort yaz. - data1=data1.fillna(value=data1.mean()) - data1=data1.fillna(value=data1.mean(),inplace=True) - \#DUPLİCATE data2=pd.DataFrame({\"CRP\":\[1,1,2,2,3,3,5\], \"ALT\":\[10,10,100,100,500,500,500\], \"P\":\[\"E\",\"E\",\"H\",\"H\",\"Belirsiz\",\"Belirsiz\",\"Belirsiz\"\]}) - data2 ![](media/image24.png) - data2.duplicated() \#trueler duplicated demek. 
\#0 False 1 True 2 False 3 True 4 False 5 True 6 False dtype: bool - data2=data2.drop\_duplicates() data2 - data2.sort\_values(\"ALT\") ![](media/image26.png) - data1\[\"Hemoglobin\"\]=data1.ALT+data1.P - filtre =data1.Hemoglobin\>0.1 data1\[filtre\] - dataYeni=data1\[(data1.Hemoglobin\>0.1)&(data1.Na\>0.2)\] dataYeni HAFTA-3 - import numpy as np - import pandas as pd - data = pd.DataFrame(np.random.randn(3,3), index = \[\"hasta1\",\"hasta2\",\"hasta3\"\], columns=\[\"Glukoz\",\"D vit\",\"CRP\"\]) data - data.CRP \#hasta1 0.708697 hasta2 0.777567 hasta3 -0.879911 Name: CRP, dtype: float64 - data\[\"D vit\"\] \#hasta1 0.865644 hasta2 -0.201652 hasta3 -1.556837 Name: D vit, dtype: float64 - data\[\[\"D vit\",\"CRP\"\]\] ![](media/image28.png) - data\[\"ALT\"\]=data\[\"Glukoz\"\]\*2 data data.loc\[\[\"hasta1\",\"hasta3\"\],\[\"Glukoz\",\"CRP\"\]\] ![](media/image30.png) - filtre = data.CRP\>0.5 data\[filtre\] - data\[(data.CRP\>0.5)\] ![](media/image32.png) - data\[(data.CRP\>0.5)&(data\[\"D vit\"\]\>-1)\] - data\[\"P\"\]= np.random.rand(3) data ![](media/image34.png) - data\[\"index\"\]=\[\"i1\",\"i2\",\"i3\"\] data\[\"sinif\"\]=\[1,1,0\] data - data.set\_index(\"index\",inplace=True) \#kalıcı olması için dataya eşitle ya da inplace true demelisin data ![](media/image36.png) - Eksik Veri Tamamlama - data1=pd.DataFrame(np.random.randn(6,6), index=\[\"Mete\",\"Ferhat\",\"Buket\",\"Batu\",\"Göktay\",\"Ahmet\"\], columns=\[\"Glukoz\",\"CRP\",\"ALT\",\"P\",\"Na\",\"D vit\"\]) data1 - data1.loc\[\"Buket\":\"Göktay\",\[\"CRP\",\"P\"\]\]=np.nan - data1.isnull() - data1.isnull().sum().sum() \#6 - data1=data1.dropna(axis=1) - data1.isnull().sum() \#Glukoz 0 CRP 3 ALT 0 P 3 Na 0 D vit 0 dtype: int64 - data1.size \#36 - def OrtalamaIleDoldur(data): toplam = data.sum().sum() elemanSayısı = data1.size - data.isnull().sum().sum() ortalama = toplam / elemanSayısı return ortalama - deger = OrtalamaIleDoldur(data1) deger \#0.1390249997549787 - data1.fillna(value = deger) 
![](media/image38.png) - data.mean(skipna=True) \#Glukoz -0.338019 D vit -0.297615 CRP 0.202118 ALT -0.676038 P 0.822002 sinif 0.666667 dtype: float64 - data1.fillna(value=data1.mean(skipna=True)) \#bu metodu kodla yaz. - data= pd.read\_csv(\"eksikveriler2.csv\") - data.head() ![](media/image40.png) - data.info() \#\ RangeIndex: 22 entries, 0 to 21 Data columns (total 5 columns): \# Column Non-Null Count Dtype \-\-- \-\-\-\-\-- \-\-\-\-\-\-\-\-\-\-\-\-\-- \-\-\-\-- 0 eegkanal 22 non-null object 1 frekans 22 non-null int64 2 genlik 22 non-null int64 3 yas 20 non-null float64 4 hasta 22 non-null object dtypes: float64(1), int64(2), object(2) memory usage: 1012.0+ bytes - data.describe() - data.shape \#(22, 5) - data.isnull().sum() \#eegkanal 0 frekans 0 genlik 0 yas 2 hasta 0 dtype: int64 - frekansGenlikYas = data.iloc\[:,1:4\] frekansGenlikYas ![](media/image42.png) - from sklearn.impute import SimpleImputer eksikVeriTamamlama =SimpleImputer(missing\_values=np.nan ,strategy=\"mean\") eksikVeriTamamlama = eksikVeriTamamlama.fit(data.iloc\[:,1:4\]) data.iloc\[:,1:4\] =eksikVeriTamamlama.transform(data.iloc\[:,1:4\]) - data\ data= pd.read\_csv(\"eksikveriler2.csv\") - from sklearn.impute import SimpleImputer - eksikVeriTamamlama =SimpleImputer(missing\_values=np.nan ,strategy=\"median\") - eksikVeriTamamlama = eksikVeriTamamlama.fit(data.iloc\[:,1:4\]) - data.iloc\[:,1:4\] =eksikVeriTamamlama.transform(data.iloc\[:,1:4\]) - data - data.isnull().sum().sum() \#0 - data= pd.read\_csv(\"eksikveriler2.csv\") from sklearn.impute import SimpleImputer eksikVeriTamamlama =SimpleImputer(missing\_values=np.nan ,strategy=\"median\") eksikVeriTamamlama = eksikVeriTamamlama.fit(data.iloc\[:,1:4\]) data.iloc\[:,1:4\] =eksikVeriTamamlama.transform(data.iloc\[:,1:4\]) data ![](media/image44.png) - data= pd.read\_csv(\"eksikveriler2.csv\") from sklearn.impute import SimpleImputer eksikVeriTamamlama =SimpleImputer(missing\_values=np.nan ,strategy=\"most\_frequent\") eksikVeriTamamlama = 
eksikVeriTamamlama.fit(data.iloc\[:,1:4\]) data.iloc\[:,1:4\] =eksikVeriTamamlama.transform(data.iloc\[:,1:4\]) - data= pd.read\_csv(\"eksikveriler2.csv\") from sklearn.impute import SimpleImputer eksikVeriTamamlama =SimpleImputer(strategy=\"constant\",fill\_value=40) eksikVeriTamamlama = eksikVeriTamamlama.fit(data.iloc\[:,1:4\]) data.iloc\[:,1:4\] =eksikVeriTamamlama.transform(data.iloc\[:,1:4\]) data ![](media/image46.png) - data.info() \#\ RangeIndex: 22 entries, 0 to 21 Data columns (total 5 columns): \# Column Non-Null Count Dtype \-\-- \-\-\-\-\-- \-\-\-\-\-\-\-\-\-\-\-\-\-- \-\-\-\-- 0 eegkanal 22 non-null object 1 frekans 22 non-null int64 2 genlik 22 non-null int64 3 yas 22 non-null float64 4 hasta 22 non-null object dtypes: float64(1), int64(2), object(2) memory usage: 1012.0+ bytes - \#eeg kanalı içindeki değerlerden kaçar tane var? - data.eegkanal.value\_counts() \#eegkanal F3-C3 9 T4-T6 7 F4-C4 6 Name: count, dtype: int64 - from sklearn.preprocessing import LabelEncoder le = LabelEncoder() hastalik = data.iloc\[: ,-1\].values hastalik = le.fit\_transform(hastalik) hastalik \#array(\[0, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1\]) - dfHastalik = pd.DataFrame(data = hastalik , index = range(22), columns = \[\"Hastalik\"\]) dfHastalik - eeg = data.iloc\[:,0:1\].values eeg ![](media/image48.png) - from sklearn.preprocessing import LabelEncoder le = LabelEncoder() eeg \[:,0\] = le.fit\_transform( data.iloc\[:,0:1\]) eeg \#array(\[\[0\], \[0\], \[0\], \[0\], \[0\], \[0\], \[0\], \[0\], \[0\], \[1\], \[1\], \[1\], \[1\], \[1\], \[1\], \[2\], \[2\], \[2\], \[2\], \[2\], \[2\], \[2\]\], dtype=object) - from sklearn.preprocessing import OneHotEncoder ohe=OneHotEncoder() eeg= ohe.fit\_transform(data.iloc\[:,0:1\]) eeg - eeg\_reshaped = eeg.reshape(22, 3) dfEEG = pd.DataFrame(data=eeg\_reshaped, index=range(22), columns=\[\"F3-C3\", \"T4-T6\", \"F4-C4\"\]) - frekansGenlikYas = data.iloc\[:,1:4\].values frekansGenlikYas dffrekansGenlikYas 
=pd.DataFrame(data=frekansGenlikYas, index=range(22), columns=\[\"Frekans\",\"Genlik\",\"Yas\"\]) dffrekansGenlikYas - dataSon = pd.concat(\[dfEEG,dffrekansGenlikYas,dfHastalik\],axis=1) dataSon - Hafta 4 - import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.model\_selection import train\_test\_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy\_score ,classification\_report - iris = pd.read\_csv(\"iris.csv\") iris.head() - iris.info() \#\ RangeIndex: 150 entries, 0 to 149 Data columns (total 6 columns): \# Column Non-Null Count Dtype \-\-- \-\-\-\-\-- \-\-\-\-\-\-\-\-\-\-\-\-\-- \-\-\-\-- 0 Id 150 non-null int64 1 SepalLengthCm 150 non-null float64 2 SepalWidthCm 150 non-null float64 3 PetalLengthCm 150 non-null float64 4 PetalWidthCm 150 non-null float64 5 Species 150 non-null object dtypes: float64(4), int64(1), object(1) memory usage: 7.2+ KB - iris.tail() ![](media/image50.png) - iris.describe() - iris.shape \#(150, 6) - iris.columns \#Index(\[\'Id\', \'SepalLengthCm\', \'SepalWidthCm\', \'PetalLengthCm\', \'PetalWidthCm\', \'Species\'\], dtype=\'object\') - iris.dtypes \#Id int64 SepalLengthCm float64 SepalWidthCm float64 PetalLengthCm float64 PetalWidthCm float64 Species object dtype: object - iris.isnull().sum() \#Id 0 SepalLengthCm 0 SepalWidthCm 0 PetalLengthCm 0 PetalWidthCm 0 Species 0 dtype: int64 - \#ÇOKKKKKKKKK ÖNEMLİİİİİİİİ 2 METHODDDDD - iris.Species.unique() \#array(\[\'Iris-setosa\', \'Iris-versicolor\', \'Iris-virginica\'\], dtype=object) - iris.Species.value\_counts() - Species - Iris-setosa 50 - Iris-versicolor 50 - Iris-virginica 50 - Name: count, dtype: int64 - setosa = iris \[iris.Species == \"Iris-setosa\"\] \#ilk 50 veri - versicolor = iris \[iris.Species == \"Iris-versicolor\"\] \#sonraki 50 veri - virginica = iris \[iris.Species == \"Iris-virginica\"\] \#son 50 veri - iris\_yeni = iris.drop(\[\"Id\"\],axis = 1) iris\_yeni.plot() 
plt.show() \#PetalWidth ayrılmış ve etkili bu yüzden plt.plot(setosa.Id , setosa.SepalLengthCm ,color = \"Red\", label =\"Setosa\") plt.xlabel(\"Id\") plt.ylabel(\"SepalLengthCm\") plt.title(\"Setosa Sepal Length Dağılımı\") plt.legend() plt.show() plt.plot(setosa.Id , setosa.SepalLengthCm ,color = \"Red\", label =\"Setosa\",linewidth =1) plt.plot(versicolor.Id , versicolor.SepalLengthCm ,color = \"blue\", label =\"Setosa\",linewidth =1) plt.plot(virginica.Id , virginica.SepalLengthCm ,color = \"green\", label =\"Setosa\",linewidth =1) plt.xlabel(\"Id\") plt.ylabel(\"SepalLengthCm\") plt.title(\"Setosa Sepal Length Dağılımı\") plt.legend() plt.show() \#mediumdan renk kodlarına bakabilirsin. plt.plot(setosa.Id , setosa.PetalLengthCm ,color = \"Red\", label =\"Setosa\",linewidth =1) plt.plot(versicolor.Id , versicolor.PetalLengthCm ,color = \"blue\", label =\"Setosa\",linewidth =1) plt.plot(virginica.Id , virginica.PetalLengthCm ,color = \"green\", label =\"Setosa\",linewidth =1) plt.xlabel(\"Id\") plt.ylabel(\"PetaLengthCm\") plt.title(\"Setosa Sepal Length Dağılımı\") plt.legend() plt.show() plt.scatter(setosa.SepalLengthCm , setosa.SepalWidthCm ,color = \"Red\", label =\"Setosa\",linewidth =1) \#çok iyi ayrılıyor plt.plot(versicolor.SepalLengthCm , versicolor.SepalWidthCm ,color = \"blue\", label =\"versicolor\",linewidth =1) plt.plot(virginica.SepalLengthCm , virginica.SepalWidthCm ,color = \"green\", label =\"virginica\",linewidth =1) plt.xlabel(\"Id\") plt.ylabel(\"SepalLengthCm\") plt.title(\"Setosa Sepal Length Dağılımı\") plt.legend() plt.show() plt.scatter(setosa.PetalLengthCm , setosa.PetalWidthCm ,color = \"Red\", label =\"Setosa\",linewidth =1) \#çok iyi ayrılıyor plt.plot(versicolor.PetalLengthCm , versicolor.PetalWidthCm ,color = \"blue\", label =\"veriscolor\",linewidth =1) plt.plot(virginica.PetalLengthCm , virginica.PetalWidthCm ,color = \"green\", label =\"virginica\",linewidth =1) plt.xlabel(\"Id\") plt.ylabel(\"SepalLengthCm\") plt.title(\"Setosa 
Sepal Length Dağılımı\") plt.legend() plt.show() plt.hist(setosa.Sepal,color =\"green\",label =\"Setosa\") plt.xlabel(\"SepalLength\") plt.ylabel(\"Frekans\") plt.title(\"Setosa Sepal Length Dağılımı\") plt.legend() plt.show() plt.subplot(2,1,1) plt.plot(setosa.Id , setosa.SepalLengthCm ,color = \"Red\", label =\"Setosa\",linewidth =1) plt.xlabel(\"SepalLength\") plt.ylabel(\"Frekans\") plt.title(\"Setosa Sepal Length Dağılımı\") plt.legend() plt.show() sns.pairplot(iris , hue =\"Species\") plt.show() iris\_yeni=iris.drop(\[\"Species\"\],axis=1) korelasyon=iris\_yeni.corr() print(korelasyon) sns.heatmap(korelasyon,annot =True) plt.show() \#bire yakın olması iyi. - iris.isnull().sum() Id 0 SepalLengthCm 0 SepalWidthCm 0 PetalLengthCm 0 PetalWidthCm 0 Species 0 dtype: int64 - iris.iloc\[0,1\] =np.nan - iris.iloc\[0,2\] =np.nan - iris.isnull().sum() \#Id 0 SepalLengthCm 1 SepalWidthCm 1 PetalLengthCm 0 PetalWidthCm 0 Species 0 dtype: int64 - iris\_ortalama = iris.copy() iris\_medyan = iris.copy() iris\_mod =iris.copy() - iris\_ortalama.fillna(iris.mean(numeric\_only = True),inplace = True) iris\_medyan.fillna(iris.mean(numeric\_only = True),inplace = True) iris\_mod.fillna(iris.mean(numeric\_only = True),inplace = True) print(iris\_ortalama.isnull().sum()) print(iris\_medyan.isnull().sum()) print(iris\_mod.isnull().sum()) \#Id 0 SepalLengthCm 0 SepalWidthCm 0 PetalLengthCm 0 PetalWidthCm 0 Species 0 dtype: int64 Id 0 SepalLengthCm 0 SepalWidthCm 0 PetalLengthCm 0 PetalWidthCm 0 Species 0 dtype: int64 Id 0 SepalLengthCm 0 SepalWidthCm 0 PetalLengthCm 0 PetalWidthCm 0 Species 0 dtype: int64 - iris\_medyan.head() ![](media/image52.png) - sns.boxplot(x = \"Species\" , y = \"PetalLengthCm\", data=iris) plt.show() - from sklearn.model\_selection import train\_test\_split from sklearn.ensemble import RandomForestClassifier from sklearn.metrics import accuracy\_score ,classification\_report iris = pd.read\_csv(\"iris.csv\") X = iris.drop(\[\"Id\"\],axis = 1) y = 
iris\[\"Species\"\] X\_train ,X\_test ,y\_train ,y\_test =train\_test\_split(X,y,test\_size =0.3,random\_state=42) - model = RandomForestClassifier() model.fit(X\_train,y\_train) y\_pred = model.predict(X\_test,y\_test) accuracy = accuracy\_score(y\_test,y\_pred) print(\"Accuracy\",accuracy) rapor = classification\_report(y\_test,y\_pred) print(\"Classification Report:\\n\",rapor) - \#HAFTA 5 - import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns - data = pd.read\_csv(\"Pokemon.csv\") - data.head() ![](media/image54.png) - data.info() \#\ RangeIndex: 151 entries, 0 to 150 Data columns (total 13 columns): \# Column Non-Null Count Dtype \-\-- \-\-\-\-\-- \-\-\-\-\-\-\-\-\-\-\-\-\-- \-\-\-\-- 0 \# 151 non-null int64 1 Name 151 non-null object 2 Type 1 151 non-null object 3 Type 2 67 non-null object 4 Total 151 non-null int64 5 HP 151 non-null int64 6 Attack 151 non-null int64 7 Defense 151 non-null int64 8 Sp. Atk 151 non-null int64 9 Sp. Def 151 non-null int64 10 Speed 151 non-null int64 11 Stage 151 non-null int64 12 Legendary 151 non-null bool dtypes: bool(1), int64(9), object(3) memory usage: 14.4+ KB - data.head() - data.describe() ![](media/image56.png) - data.tail() - data.isnull().sum() - data.isnull().sum().sum() \#84 - data.Legendary.unique() \#kaç sınıflı \#array(\[False, True\]) - data.Legendary.value\_counts() \#Legendary False 147 True 4 Name: count, dtype: int64 - data\[\"Type 1\"\].unique() \#array(\[\'Grass\', \'Fire\', \'Water\', \'Bug\', \'Normal\', \'Poison\', \'Electric\', \'Ground\', \'Fairy\', \'Fighting\', \'Psychic\', \'Rock\', \'Ghost\', \'Ice\', \'Dragon\'\], dtype=object) - data\[\"Type 2\"\].unique() \#array(\[\'Poison\', nan, \'Flying\', \'Ground\', \'Fairy\', \'Grass\', \'Fighting\', \'Psychic\', \'Steel\', \'Ice\', \'Rock\', \'Water\'\], dtype=object) - data.head(20) ![](media/image58.png) - data\[\"Type 2\"\].fillna(data\[\"Type 1\"\],inplace = True) \#kontrol et data.head(20) data = 
data.drop(\[\"\#\"\],axis = 1) data.head() ![](media/image60.png) - korelasyon = data.corr(numeric\_only = True) plt.figure(figsize =(12,8)) sns.heatmap(korelasyon,annot = True,cmap =\"coolwarm\",linewidths = 4) plt.show() - data.Speed.plot(color = \"r\",label =\"hız\",linestyle = \"-\",) data.Defense.plot(color = \"g\",label = \"Defans\",linestyle = \":\") plt.xlabel(\"X ekseni\") plt.ylabel(\"Y label\") plt.title(\"Başlık\") plt.legend() plt.show() ![](media/image62.png) - data.plot(kind =\"scatter\",x=\"Attack\",y=\"Defense\",color=\"r\") data.xlabel=\"Attack\" data.ylabel =\"Defense\" plt.show() - data.plot(kind =\"hist\",y=\"Attack\",bins=50,figsize=(12,8)) plt.show() ![](media/image64.png) - data.tail(20) - data\[(data\[\"Total\"\] \> 495) & (data\[\"Sp.Atk\"\] \> 55)\] - data.Generation.value\_counts() - ates = \[data\[data\[\"Type 1\"\]\]==\"False\"\] su = \[data \[\"Type 1\"\] == \"Water\"\] plt.scatter(ates\[\"Attack\",ates\[\"Defense\"\],color =\"r\",label=\"Fire\"\]) plt.scatter(su\[\"Attack\",ates\[\"Defense\"\],color =\"r\",label=\"Fire\"\]) plt.show() - data1 = data.drop(\[\"Legendary\",\"Generation\"\],axis = 1) sns.boxplot(data=data1) plt.show() - import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.preprocessing import StandardScaler from sklearn.model\_selection import train\_test\_split from sklearn.preprocessing import LabelEncoder from sklearn.neighbors import KNeighborsClassifier from sklearn.metrics import accuracy\_score,classification\_report data = pd.read\_csv(\"Pokemon.csv\") data ![](media/image66.png) - data = data.drop(\[\"\#\"\],axis =1 ) data = data.drop(\[\"Name\"\],axis =1 ) - encoder = LabelEncoder() data \[\"Type 1 \"\] = encoder.fit\_transform(data\[\"Type 1\"\]) data \[\"Type 2\"\] = data\[\"Type 2\"\].fillna(\"None\") data \[\"Type 2\"\] = encoder.fit\_transform(data\[\"Type 2\"\]) data - X =data.drop(\[\"Legendary\"\],axis=1) y= data\[\"Legendary\"\].astype(int) y \#0 0 1 
0 2 0 3 0 4 0.. 146 0 147 0 148 0 149 1 150 0 Name: Legendary, Length: 151, dtype: int32 - X\_train,X\_test,y\_train,y\_test = train\_test\_split(X,y,test\_size =0.3,random\_state=42) scaler = StandardScaler() X\_train\_scaled = scaler.fit\_transform(X\_train) X\_test\_scaled =scaler.transform(X\_test) knn = KNeighborsClassifier(n\_neighbors=7) knn.fit(X\_train\_scaled,y\_train) y\_pred = knn.predict(X\_test\_scaled) accuracy = accuracy\_score(y\_test,y\_pred) print(\"Accuracy\",accuracy) sınıflandırmaRaporu = classification\_report(y\_test,y\_pred) print(\"Sınıflandırma Raporu\", sınıflandırmaRaporu) - knn = KNeighborsClassifier(n\_neighbors=7) knn.fit(X\_train\_scaled,y\_train) y\_pred = knn.predict(X\_test\_scaled) accuracy = accuracy\_score(y\_test,y\_pred) print(\"Accuracy\",accuracy) sınıflandırmaRaporu = classification\_report(y\_test,y\_pred) print(\"Sınıflandırma Raporu\", sınıflandırmaRaporu) - \#Normalizasyon anlamlı çıkmadı.\ X =data.drop(\[\"Legendary\",\"Type 1\",\"Type 2\"\],axis=1) y= data\[\"Legendary\"\].astype(int) X\_train,X\_test,y\_train,y\_test = train\_test\_split(X,y,test\_size =0.3,random\_state=42) scaler = StandardScaler() X\_train\_scaled = scaler.fit\_transform(X\_train) X\_test\_scaled =scaler.transform(X\_test) knn = KNeighborsClassifier(n\_neighbors=15) knn.fit(X\_train\_scaled,y\_train) y\_pred = knn.predict(X\_test\_scaled) accuracy = accuracy\_score(y\_test,y\_pred) print(\"Accuracy\",accuracy) sınıflandırmaRaporu = classification\_report(y\_test,y\_pred) print(\"Sınıflandırma Raporu\", sınıflandırmaRaporu) \#HAFTA 6 - \#Linear Regresyon :Bağımlı değişken ile bağımsız değişken arasındaki ilişki. - \#y=b0+b1x - \#y tahmin edeceğimiz değer. - \#R2 1 e ne kadar yakınsa o kadar iyi. 
- import numpy as np import pandas as pd import matplotlib.pyplot as plt import seaborn as sns - data =pd.read\_csv(\"linear-regression-dataset.csv\") data ![](media/image68.png) - plt.scatter(data.deneyim,data.maas) plt.xlabel(\"Deneyim\") plt.ylabel(\"Maaş\") plt.show() from sklearn.linear\_model import LinearRegression tahmin = LinearRegression() \#reshape yerine : X=\[\[\"deneyim\"\]\] \#y=data\[\"maaş\"\] X = data.deneyim.values.reshape(-1,1) y = data.maas.values.reshape(-1,1) tahmin.fit(X,y) skor = tahmin.score(X,y) print(skor) \#0.9775283164949902 - b1 = tahmin.coef\_ print(b1) b0=tahmin.intercept\_ print(b0) print(\"Regresyon denklemi y=\" +str(b0) +\"+\"+str(b1)+\"x\") \#\[\[1138.34819698\]\] \[1663.89519747\] Regresyon denklemi y=\[1663.89519747\]+\[\[1138.34819698\]\]x - deneyim9 = 1663.89519747+1138.34819698\*9 deneyim9 \#11909.028970289999 - deneme =np.array(9).reshape(-1,1) deneyim9=tahmin.predict(deneme) deneyim9 \#array(\[\[11909.02897025\]\]) - deneyim9 =tahmin.predict(\[\[9\]\]) \#sonucu yazdırırken iki boyutlu dizi olarak düşündüğü için iki kare parantez deneyim9 \#array(\[\[11909.02897025\]\]) - deneyim10 = 1663.89519747+1138.34819698\*10 deneyim10 \#başarılı. 
\#13047.377167269999 - array = np.array(\[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15\]).reshape(-1,1) plt.scatter(X,y) tahminEdilen = tahmin.predict(array) plt.plot(array,tahminEdilen,color=\"red\") plt.show() ![](media/image70.png) - from sklearn.metrics import r2\_score from sklearn.metrics import mean\_squared\_error from sklearn.metrics import root\_mean\_squared\_error from sklearn.metrics import mean\_absolute\_error from sklearn.metrics import mean\_absolute\_percentage\_error from sklearn.model\_selection import train\_test\_split data =pd.read\_csv(\"linear-regression-dataset.csv\") - lr = LinearRegression() X = data.deneyim.values.reshape(-1,1) y = data.maas.values.reshape(-1,1) X\_train,X\_test,y\_train,y\_test=train\_test\_split(X,y,test\_size=0.2,random\_state=0) lr.fit(X\_train,y\_train) y\_pred = lr.predict(X\_test) mse = mean\_squared\_error(y\_test,y\_pred) print(\"MSE\",mse) rmse0 = np.sqrt(mse) print(\"RMSE0\",rmse0) rsme= root\_mean\_squared\_error(y\_test,y\_pred) print(\"RMSE\",rsme) def rsme\_hesapla(y\_test,y\_pred): return np.sqrt(mean\_squared\_error(y\_test,y\_pred)) rsme2 =rsme\_hesapla(y\_test,y\_pred) print(\"RMSE2:\",rsme2) r2=r2\_score(y\_test,y\_pred) print(\"R2\",r2) mae =mean\_absolute\_error(y\_test,y\_pred) print(\"MAE\",mae) mape = mean\_absolute\_percentage\_error(y\_test,y\_pred) print(\"MAPE\",mape) \#MSE 1972063.0531601794 RMSE0 1404.3016247089438 RMSE 1404.3016247089438 RMSE2: 1404.3016247089438 R2 0.9455565414771729 MAE 1363.6851222599787 MAPE 0.13733128641333722 - from sklearn.linear\_model import Ridge, Lasso, ElasticNet from sklearn.tree import DecisionTreeRegressor from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor from xgboost import XGBRegressor from lightgbm import LGBMRegressor from catboost import CatBoostRegressor from sklearn.neighbors import KNeighborsRegressor from sklearn.svm import SVR model = { \"Linear Regression\": LinearRegression(), \"Ridge\": Ridge(), \"Lasso\": Lasso(), 
\"ElasticNet\": ElasticNet(), \"Decision Tree\": DecisionTreeRegressor(), \"Random Forest\": RandomForestRegressor(), \"XGBoost\": XGBRegressor(), \"LightGBM\": LGBMRegressor(verbose=0), \"CatBoost\": CatBoostRegressor(verbose=0), \"Gradient Boosting\": GradientBoostingRegressor(), \"KNeighbors\": KNeighborsRegressor(), \"SVR\": SVR() }