CA325 Pandas basics.pdf
Document Details
Uploaded by Deleted User
Tags
Full Transcript
CA325: BASIC CONCEPTS OF PANDAS Pandas provides two types of classes for handling data: 1. Series: a one-dimensional labelled array holding data of any type such as integers, strings, Python objects etc. 2. DataFrame: a two-dimensional data structure that holds data l...
CA325: BASIC CONCEPTS OF PANDAS Pandas provides two types of classes for handling data: 1. Series: a one-dimensional labelled array holding data of any type such as integers, strings, Python objects etc. 2. DataFrame: a two-dimensional data structure that holds data like a two-dimensional array or a table with rows and columns. #CREATING SERIES OF DATA import pandas as pd import numpy as np #Creating a series of data s = pd.Series([1, 3, 5, np.nan, 6, 8]) print(s) #Series Generation import pandas as pd import numpy as np #Series Generation a=pd.Series(np.random.randn(5)) b=pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e']) print(a) print(b) ------------------------------- import pandas as pd import numpy as np #Generate random integers from 0 to 6 (the upper bound 7 is excluded) - total 10 numbers a=pd.Series(np.random.randint(0,7, size =10)) print(a) print(a.value_counts()) ---------------- import pandas as pd #import numpy as np s1=pd.Series(range(0,4)) s2=pd.Series(range(1,5)) print(s1) print(s2) print(s1+s2) # OPERATIONS ON SERIES import pandas as pd import numpy as np #Generate random integers from 0 to 6 (the upper bound 7 is excluded) - total 5 numbers a=pd.Series(np.random.randint(0,7, size =5)) print(a) print("value count") print(a.value_counts()) print("size ") print(a.size) print("count series") print(a.count()) print("describe ") print(a.describe()) #Print 6 dates import pandas as pd dates = pd.date_range("2024", periods=6) print(dates) import pandas as pd import numpy as np # Creating Data frame from series d = {'c1': pd.Series(['A', 'B', 'C']), 'c2': pd.Series([1, 2, 3, 4])} df = pd.DataFrame(d) print(df) If we change the series to d = {'c1': pd.Series(['A', 'B', 'C']), 'c2': pd.Series([1, 2., 3., 4])} then the values of c2 are printed as floats: 1.0, 2.0 and so on import pandas as pd import numpy as np # Creating Data frame from random numbers df = pd.DataFrame(np.random.randn(10,4)) print(df) # Creating Data frame from random numbers with named columns df = pd.DataFrame(np.random.randn(5,2), columns =["A", "B"]) print(df) # 
CREATING DATAFRAME USING RANDOM command import numpy as np import pandas as pd N = 30 gen = np.random.choice(['Male', 'Female'], N, p=[0.6, 0.4]) height = 140 + (200-140) * np.random.rand(N) weight = 40 + (120-40) * np.random.rand(N) salary = 30000+(80000-30000) * np.random.rand(N) df = pd.DataFrame(data=[gen, height, weight, salary]).transpose() df.columns = ["Gen", "Height", "weight", "salary"] print(df.columns) #print(df) for i in df.columns[1:]: df[i]= df[i].astype(float).round(2) print(df) #CREATING DATAFRAME FROM KEY AND VALUE import pandas as pd #import numpy as np df2 = pd.DataFrame( { "A": 1.0, "B": pd.Timestamp("20240507"), "C": pd.Series(3, index=list(range(4)), dtype="float32"), "E": pd.Categorical(["test", "train", "test", "train"]), "F": "Good", } ) # Print data print(df2) ------------------------------------------------ # Print datatypes print(df2.dtypes) # Print column names print(df2.columns) #Print top rows print(df2.head()) #Print index data print(df2.index) #Print statistics print(df2.describe()) # CREATING DATAFRAME USING LIST AND DICTIONARY import pandas as pd #import numpy as np df = pd.DataFrame( { "id": [1,2,3,4,5,6], "grade": ["a","b","b","a","a","b"], } ) # Print data print(df.head()) ------------------------------------------------- # Print data print(df.head()) df["g1"]= df["grade"].astype("category") print(df["g1"]) df.insert(2, "b", "ok") print(df) print(df.columns) # CREATING DATAFRAME USING CSV file # STEP 1: Importing libraries #import numpy as np import pandas as pd # Reading file # STEP 2: Import the dataset from assigned path path = 'D:/amisha/2024-25/CA325-Data Science/Course File/Practical/fruit-data.csv' # STEP 3: Read the file df = pd.read_csv(path) # STEP 4: Print the first five lines print(df.head(5)) --------------------------------------------------------- print(df.columns) Now the target is to be separated from the dataset x = df.iloc[:, :-1].values print(x) - Prints all data except target y = df.iloc[:, 6].values print(y) - 
Prints target – Fruit label data