Friday, 27 November 2020

Practical Python Codes

How to Merge Multiple Tabs in Excel into One Excel Sheet

import pandas as pd

f = "D:/Test/Any_Data.xlsx"
numberOfSheets = 5  # Modify this.
frames = []
for i in range(1, numberOfSheets + 1):
    data = pd.read_excel(f, sheet_name='Sheet' + str(i), header=None)
    frames.append(data)
final = "D:/Test/mergedfile3.xlsx"
df = pd.concat(frames)
df.to_excel(final, index=False)
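
If the tab names don't follow the Sheet1, Sheet2, ... pattern, passing sheet_name=None reads every tab at once. A minimal sketch, assuming the same file paths as above:

import pandas as pd

f = "D:/Test/Any_Data.xlsx"
# sheet_name=None returns a dict of {sheet name: DataFrame} covering all tabs
sheets = pd.read_excel(f, sheet_name=None, header=None)
merged = pd.concat(sheets.values())
merged.to_excel("D:/Test/mergedfile3.xlsx", index=False)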


How to Merge Multiple Excel Files into One Excel Sheet

import os
import pandas as pd

os.chdir('D:\\Test\\')
cwd = os.path.abspath('D:\\Test\\')
files = os.listdir(cwd)

## Method 1: takes the first sheet of each file
## (DataFrame.append was removed in pandas 2.0, so collect the frames and concat once)
frames = []
for file in files:
    if file.endswith('.xlsx'):
        frames.append(pd.read_excel(file))

df = pd.concat(frames, ignore_index=True)
df.head()
df.to_excel('total_sales.xlsx')
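
A variation of the same merge using glob, which avoids changing the working directory; the folder path is the same placeholder as above:

import glob
import pandas as pd

# glob only lists the .xlsx files, so no endswith() check is needed
frames = [pd.read_excel(f) for f in glob.glob('D:\\Test\\*.xlsx')]
df = pd.concat(frames, ignore_index=True)
df.to_excel('total_sales.xlsx', index=False)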

Updating information using pandas: writing SQL queries on CSV files and putting the result into your Excel sheet.

import pandas as pd
import xlwings as xw
from pandasql import sqldf

pysqldf = lambda q: sqldf(q, globals())
df = pd.read_csv('Test.csv')
df1 = pysqldf("SELECT * FROM df where columnname not like 'None' and column_2 in('abc') order by host, case areas when 'London' then 1 when 'UK' then 2 when 'Pakistan' then 3 END ASC")
new_header = df1.iloc[0]  # grab the first row for the header
df1 = df1[1:]  # take the data less the header row
df1.columns = new_header  # set the header row as the df header
print(pysqldf('SELECT * FROM df1'))
# load the workbook that has a chart in it
wb = xw.Book('D:\\Python_Work\\data\\AnyReport.xlsx')
ws = wb.sheets['Sheet1']
ws.range('S5').options(index=False).value = df1
wb.save('D:\\Python_Work\\data\\AnyReport2.xlsx')
wb.app.quit()  # quit the Excel instance that owns this workbook

Loading an Excel file and then updating the sheet information at a certain row and column

#Using openpyxl (pip install openpyxl)
from openpyxl import load_workbook

wb = load_workbook(r"D:\TEST\T.xlsx")
sheets = wb.sheetnames
sheet1 = wb[sheets[0]]  # first sheet, looked up by name
#Then update as you want
sheet1.cell(row=2, column=4).value = 5  # This will change cell (2, 4) to 5
wb.save("HERE PUT THE NEW EXCEL PATH")
 

Email Handling in Python

import win32com.client
import datetime as dt

todayDaysDateTime = dt.datetime.now() - dt.timedelta(days=0)
Outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")  # Opens Microsoft Outlook
folder = Outlook.Folders[0]  # your mailbox
print(folder)
subFolder = folder.Folders[1]  # your Inbox
subFolderMessages = subFolder.Items  # the items of your Inbox, i.e. your emails
message = subFolderMessages.GetFirst()

while message:
    # Filter: stop at the first mail whose subject matches
    if message.Subject == "sub1":
        print("yes")
        break
    else:
        message = subFolderMessages.GetNext()

print(message.Subject)  # get the subject of the email
myattachments = message.Attachments  # attachments of the email
attach = myattachments.Item(1)  # get the first attachment of the email
attach.SaveAsFile('A:\\dir1\\dir2\\' + str(attach))  # save it to the location

Another Approach Using a for Loop

import win32com.client
import datetime as dt
from datetime import timedelta

todayDaysDateTime = dt.datetime.now() - dt.timedelta(days=0)
day_num = todayDaysDateTime.strftime("%Y-%m-%d")
Outlook = win32com.client.Dispatch("Outlook.Application").GetNamespace("MAPI")  # Opens Microsoft Outlook
folder = Outlook.Folders[0]  # your mailbox
print(folder)
subFolder = folder.Folders[1]  # your Inbox
subFolderMessages = subFolder.Items  # the items of your Inbox, i.e. your emails


for message in subFolderMessages:
    if 'youremailsubject' in message.Subject and str(message.ReceivedTime).split(' ')[0] == f'{day_num}':
        print(message.ReceivedTime)
        print(message.Subject)
        break

myattachments = message.Attachments  # attachments of the matched email
attach = myattachments.Item(1)  # get the first attachment of the email
attach.SaveAsFile('A:\\path1\\path2\\' + str(attach))  # save it to the location
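
Outlook's COM API can also do the filtering itself via Items.Restrict, which usually beats looping over every item in Python. A hedged sketch, reusing the subFolderMessages collection from above (the subject string is the same placeholder, and Restrict matches the subject exactly rather than as a substring):

# Restrict returns only the items matching the filter (Jet query syntax)
filtered = subFolderMessages.Restrict("[Subject] = 'youremailsubject'")
for message in filtered:
    print(message.ReceivedTime, message.Subject)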

##############

import pyodbc
import pandas.io.sql as psql
import pandas as pd
import datetime
from datetime import timedelta
import win32com.client

#Variables for con, dataframe
con=pyodbc.connect("DSN=WorldISWorldFULL",autocommit=True)
cursor=con.cursor()
MY_ARRAY_TAB= ["ABC","DEF","GHI"]
D='abc123'
b='abc123'

#Generating queries
def queries(Dates,CAB,CABDAY,CABM):
    if 'InTheMYARY' in CAB:      
        return """
               s '{}' as D,'{}' as CAB, count(**************) as CTS from {}_{}
        """.format(Dates,CAB,CAB,CABDAY)
    else:    
        return """
               s '{}' as D,'{}' as CAB, count(************) as CTS from {}_{}
               dare datefdateabc(abc,def+cz*circle,2020-11-11),dd/MM/yy)='{}'
        """.format(Dates,CAB,CAB,CABM,Dates)


#Date conversion
def date_conversion(x,CAB):
    sysdate=datetime.datetime.now()
    unixdate=datetime.datetime(2020,11,11)
    YourDaysRNumbered=sysdate-timedelta(days=x)
    CABDAY=str((YourDaysRNumbered - unixdate).days)
    CABM=(sysdate.year - unixdate.year) * 144 + (sysdate.month - unixdate.month)
    Date = datetime.datetime.now()-timedelta(days=x)
    Date=str(Date.strftime("%d/%m/%y"))
    return Date,queries(Date,CAB,CABDAY,CABM)

#Default value if CAB NF
def default_frame(d,t):
    default_value = [[d,t,'NF']]
    return pd.DataFrame(default_value,columns=['Dates','CAB','CTS'])


def send_mail(df):
    o = win32com.client.Dispatch("Outlook.Application")
    Msg = o.CreateItem(0)
    Msg.Importance = 0
    Msg.Subject = 'Hello World'
    Msg.To = DESTINATARIES  # placeholder: the recipients string
    Msg.SentOnBehalfOfName = sby  # placeholder: the "send on behalf of" address
    Msg.HTMLBody = '''<html><h1>TEST ONE</h1></html>{}'''.format(df.to_html(index=False))
    Msg.Send()


# Main loop for CAB
def main():
    df = pd.DataFrame()
    for CAB in MY_ARRAY_TAB:
        for x in range(7,11):       
            dates,sql = date_conversion(x,CAB)
            try:
                df1=psql.read_sql(sql,con)
                df = pd.concat((df,df1))
            except Exception as e:
                if 'NF(2020)' in e.args[0]:
                    df=pd.concat((df,default_frame(dates,CAB)))
                else:
                    print(e.args[0])
                          
    send_mail(df)        




if __name__ == "__main__":
    main() 

######

Querying an Excel File Using Pandas and sqldf and Saving the Result as Comma-Separated Values (CSV)

import pandas as pd
from pandasql import sqldf
from datetime import date, timedelta

today = date.today() - timedelta(1)  # yesterday's date
day_num = today.strftime("%Y-%m-%d")
print(day_num)
pysqldf = lambda q: sqldf(q, globals())
df = pd.read_excel('1.xlsx', sheet_name='sheet1')
# normalize the column names: strip spaces, lowercase, and drop parentheses
df.columns = (df.columns.str.strip().str.lower()
              .str.replace(' ', '_', regex=False)
              .str.replace('(', '', regex=False)
              .str.replace(')', '', regex=False))
df["my_date_"] = df["my_date"].dt.date  # keep only the date part of the my_date column
df1 = pysqldf(f"SELECT my_date_, count(1) FROM df where my_date_='{day_num}' group by my_date_")
print(pysqldf('SELECT * FROM df1'))
df1.to_csv(f'YourFileName_{day_num}.csv', index=False)
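
For reference, the same count can be done in pure pandas without sqldf. A minimal sketch, assuming the df and day_num defined above:

from datetime import datetime

target = datetime.strptime(day_num, "%Y-%m-%d").date()
counts = (df[df["my_date_"] == target]       # keep only yesterday's rows
          .groupby("my_date_").size()        # count rows per date
          .reset_index(name="count"))
counts.to_csv(f'YourFileName_{day_num}.csv', index=False)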

Reading a value from a CSV file and then putting that value into an Excel file at a target cell

import pandas as pd
import xlwings as xw

# read the csv file into a data frame
df = pd.read_csv('1.csv')
df["columnNameof_CSVFile"]
# read a particular value from the column of the dataframe (indexNo is a placeholder, e.g. 7 or 0)
print(df['columnNameof_CSVFile'].values[indexNo])

# load the workbook
wb = xw.Book('2.xlsx')
ws = wb.sheets['SheetName in 2.xlsx']
# find the first empty row in column C
x = ws.range('C' + str(ws.cells.last_cell.row)).end('up').row + 1
cell = 'C' + str(x)
ws.range(cell).value = df['columnNameof_CSVFile'].values[7]
wb.save('2.xlsx')

Query your database and write the result of your SQL query to a CSV file

import pyodbc

def write_queryresult_to_csv_file(cursor, sql, csv_file):
    cursor.execute(sql)
    file_header = ''
    f = open(csv_file, 'w')
    # build the header line from the cursor's column descriptions
    columns = [column[0] for column in cursor.description]
    for col in columns:
        file_header = file_header + col + ','
    f.write(file_header[:-1] + '\n')
    for row in cursor:
        file_row = ''
        for element in row:
            if element is None:
                element = ''
            # commas inside values are replaced so they don't break the CSV layout
            file_row = file_row + element.__str__().replace(',', ' ') + ','
        f.write(file_row[:-1] + '\n')
    f.close()


pyodbc.autocommit = True
ch = pyodbc.connect('DSN=odbcdbname', autocommit=True)
curs = ch.cursor()
sql = "yourSQLQuery"
file_name = "D:\\dirname\\1.csv"
write_queryresult_to_csv_file(curs, sql, file_name)
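
If escaping commas by hand is not required, the same result can be had in a couple of lines with pandas. A sketch assuming the same pyodbc connection and placeholder query as above:

import pandas as pd

df = pd.read_sql("yourSQLQuery", ch)  # ch is the pyodbc connection from above
df.to_csv("D:\\dirname\\1.csv", index=False)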

Dealing with the Weekdays in Python

from datetime import datetime, timedelta

day = 'day/month/YY'  # placeholder, e.g. '27/11/20'
dt = datetime.strptime(day, '%d/%m/%y')
print("Date given by the user: " + str(dt))
if dt.weekday() == 6:  # weekday() returns 6 for Sunday and 0 for Monday in Python
    start = dt  # the week starts on this Sunday
else:
    start = dt - timedelta(days=dt.weekday() + 1)  # go back to the previous Sunday
end = start + timedelta(days=6)  # Saturday of the same week
print(start)
print(end)

print("Week day = " + str(dt.weekday()))
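
To sanity-check the logic, here is the same calculation run on a concrete date (27/11/20 is a Friday, so weekday() returns 4):

from datetime import datetime, timedelta

dt = datetime.strptime('27/11/20', '%d/%m/%y')
start = dt if dt.weekday() == 6 else dt - timedelta(days=dt.weekday() + 1)
end = start + timedelta(days=6)
print(start.date(), end.date())  # 2020-11-22 (Sunday) to 2020-11-28 (Saturday)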

Thursday, 14 May 2020

Machine Learning and Data Science using Python


Environment of Work in Data Science

1) Work environment using Jupyter
2) Data analysis using the pandas library
3) NumPy (a fundamental library for anyone doing data science)
4) Data visualization (the matplotlib library, which lets us make neat graphs and visuals to describe our data)
5) Building models, training models, and checking the accuracy of ML models (using the scikit-learn library)
6) Types of learning (supervised learning: regression and classification)
7) Special topics such as deep learning, neural networks, and transfer learning, using the latest versions of TensorFlow and Keras


What is Machine Learning?
In simple words: machine learning is when a computer learns from given input data.

Machine learning means using an algorithm or computer program to learn about different patterns in data, and then using that algorithm and what it has learned to make predictions about the future on similar data. Machine learning algorithms are also called models.

Types of Machine Learning
1) Supervised learning is done by drawing lines based on the given data. An example is regression.
2) Unsupervised learning is when we are given data without labels, e.g. CSV files without column names. Here we associate rules with the given set of data.
3) Reinforcement learning is teaching a machine through trial and error, using rewards and punishments.

Steps in Data Modelling

1. Problem definition (what type of problem do we have at hand?)
2. Data (what type of data do we need to analyze?)
3. Evaluation (what counts as success or failure?)
4. Features (which features should we model?)
5. Model (what kind of model should we use to solve the problem?)
6. Experiments (which different combinations should we try?)
Conclusion:
Keep repeating the steps until the model solves the problem.

Flow of data science work

A practical and usual data science workflow starts with:
1)      opening a CSV file in a Jupyter notebook
2)      exploring the data and performing data analysis using pandas
3)      making visualizations/graphs and comparing different data points using matplotlib
4)      building machine learning models on the data using scikit-learn.
Key
* pandas (a Python library for data analysis)
* matplotlib (a Python library for visualization and graphs)
* Jupyter notebook (a tool for building machine learning projects)
* scikit-learn, also known as sklearn (a free machine learning library for Python)




Types of Data

Unstructured data is things like images, voice data from mobile calls, and videos.
Structured data is things like a patient's medical history of disease.

Types of Machine Learning
1.       Supervised learning
2.       Unsupervised learning
3.       Transfer learning
4.       Reinforcement learning

Types of Evaluation (metrics by problem type)
Classification: Accuracy, Precision, Recall
Regression: Mean Absolute Error (MAE), Mean Squared Error (MSE), Root Mean Squared Error (RMSE)
Recommendation: Precision at K

 

Variables in Data Science
Feature variables are used to predict the target variable.
Feature variables are the attributes describing your data.
Kinds of feature variables
1.       Numerical features (like body height as a number)
2.       Categorical features (like gender: male or female)
3.       Derived features (like the number of visits a person makes to a hospital in a year)
Important concept of ML
Split your data into 3 different sets, as in the sketch below:
1.      training data set (to train your model)
2.      validation data set (to choose your model)
3.      test data set (to test and compare your model with other models)
The usual split is:
70% for the training data set
15% for the validation data set
15% for the test data set
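
A minimal sketch of the 70/15/15 split using scikit-learn's train_test_split applied twice (the data here is dummy data, just to show the shapes):

import numpy as np
from sklearn.model_selection import train_test_split

X = np.random.rand(100, 3)        # dummy feature matrix
y = np.random.randint(0, 2, 100)  # dummy binary target
# first split off 30%, then split that half-and-half into validation and test
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.50)
print(len(X_train), len(X_val), len(X_test))  # 70 15 15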


Three parts of modelling
1. Choosing a model
2. Training a model
3. Comparing models
Choosing a model
When working with structured data, the following models work best:
1.      decision trees, such as random forests and gradient boosting algorithms (CatBoost, XGBoost)
When working with unstructured data, the following models work best:
1.       deep learning neural networks
2.       transfer learning



Training the model
Training works by aligning the inputs (feature variables) and output (target variable) of the data:
Feature variables (input) → Model (the model finds the patterns) → Target variable (output)
The input (feature variables) = X is used to predict the output (target variable) = y.

Improving your Model
Hyperparameters (choose different hyperparameters for your model).
How well the model line fits the data points is conceptualized as follows:
1.       Overfitting (often caused by data leakage)
2.       Underfitting (often caused by data mismatch)
3.       Balanced (the "Goldilocks zone")

Data leakage & Data Mismatch
Data leakage happens when some of your test data leaks into your training data.
This often results in overfitting, or a model doing better on the test set than on the training set.
Data mismatch happens when the data used for testing is different from the data used for training, that is:
training data set != testing data set
features in the training data set != features in the testing data set.
Model Tuning
The training and validation data sets are used for:
1.       Validation
2.       Model tuning
The testing data set is used for:
1.      Testing
2.      Model comparison

Model comparison should compare like with like (comparing cars to cars).


To Build the Environment

In the Anaconda prompt window (Miniconda3):
conda create --prefix ./env pandas numpy matplotlib scikit-learn
or, if I want to install Jupyter separately:
conda install jupyter
C:\Users\Tahir\Desktop\sample_pro\env>conda activate C:\Users\Tahir\Desktop\sample_project_1\env
After running this, the command prompt changes to the following:
(C:\Users\Tahir\Desktop\sample_pro\env)

Note: for a kernel error in Jupyter Notebook, install/upgrade as follows: pip install jupyter_client --upgrade

Pandas in Python for Data Analysis

import pandas as pd

names = pd.Series(["Tah", "Zah", "Mah"])
colour = pd.Series(["Red", "green", "Blue"])
df = pd.DataFrame({"k1": names, "k2": colour})
# write to and read back from CSV using the data frame
df.to_csv("abc.csv", index=False)
df2 = pd.read_csv("abc.csv")
# attributes of a DataFrame
df.dtypes
df.columns
df.index
# functions of a DataFrame
df.info()
df.describe()
df2["column_name"].sum()
df2.column_name

Environment Keys:-
Hit Esc then m, x, b, or a in Jupyter Notebook (cell shortcuts)
Hit Shift+Tab to see the description of a function
Hit Shift+Enter to execute the command in Jupyter Notebook
Hit Ctrl+Shift+- to split a cell into two in Jupyter Notebook

     

Data frames in Pandas

Columns are represented by axis=1
Rows are represented by axis=0
Index numbers start at 0 by default.

pd.crosstab(df["col1"], df["col2"])
df["col1"].fillna(df["col1"].mean(), inplace=True)
df["col1"].dropna(inplace=True)
## how to create a column from a Series in python
new_column = pd.Series([7, 7, 7, 7])

## how to create a column from a python list

## it should be the same length as the data frame df

capacity_column = [7.2, 8.2, 3.2, 2.2]

df["An_other_new_Column"] = capacity_column

## to drop a column using the function

df.drop("ColumnName", axis=1)

## to shuffle the data frame

new_DF = df.sample(frac=0.5)  # frac=1 for 100%

## to remove the extra index column
df.reset_index(drop=True, inplace=True)

Anonymous functions

Apply a function to a column of the data frame:

df["columnName"] = df["columnName"].apply(lambda x: x / 2)

That is: the lambda takes x, divides it by 2, and the result is assigned back to the column.
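
A tiny runnable example of the same apply/lambda pattern (the column name here is made up):

import pandas as pd

df = pd.DataFrame({"columnName": [10, 20, 30]})
df["columnName"] = df["columnName"].apply(lambda x: x / 2)  # halve every value
print(df["columnName"].tolist())  # [5.0, 10.0, 15.0]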

## ndarray in Python from the NumPy package

import numpy as np

mysamplearray = np.ones((2, 3))  # hit Shift+Tab to see the description of the function
mysamplearray = np.zeros((2, 3))
mysamplearray = np.arange(0, 10, 2)  # 0 = start, 10 = end, 2 = the step interval
mysamplearray = np.random.randint(0, 10, size=(3, 5))
mysamplearray = np.random.rand(5, 3)  # 5 rows and 3 columns
np.random.seed(0)  # seed the pseudo-random number generator
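
Seeding matters for reproducibility: with a fixed seed, the "random" numbers come out the same on every run. A small demonstration:

import numpy as np

np.random.seed(42)
print(np.random.randint(0, 10, size=3))  # same three numbers on every run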

## plotting using matplotlib in python

%matplotlib inline

import matplotlib.pyplot as mplt
import pandas as pd
import numpy as np

x = [1, 2, 3, 4]
y = [11, 22, 33, 44]
mplt.plot(x, y)
------------------

## Flow of matplotlib

# 1. import matplotlib and get it ready for plotting in Jupyter
%matplotlib inline
import matplotlib.pyplot as plt

# 2. prepare data
x = [1, 2, 3, 4]
y = [11, 22, 33, 44]

# 3. setup plot
fig, ax = plt.subplots(figsize=(10, 10))  # (width, height)

# add data
ax.plot(x, y)

# 4. customize plot
ax.set(title="SimplePlot", xlabel="x values", ylabel="y values")

# 5. save your figure
fig.savefig("images/simpleplot.png")



## Making figures using NumPy arrays

# Line plot
# Scatter plot
# Bar plot
# Histogram

import numpy as np

# create some data
x = np.linspace(0, 10, 100)
x[:10]

# plot the data and create a line plot
fig, ax = plt.subplots()
ax.plot(x, x**2);

# scatter plot
fig, ax = plt.subplots()
ax.scatter(x, np.sin(x))

# making a plot from a dictionary
prices = {"butter": 10, "Cake": 20, "Milk": 50}
fig, ax = plt.subplots()
ax.bar(prices.keys(), prices.values())
ax.set(title='butter Cake Milk Shop', ylabel="prices")


#How to plot our data frame using Pandas

import pandas as pd

# make a data frame
cs = pd.read_csv("CS.csv")
cs["Price"] = cs["Price"].str.replace(r'[\$\,\.]', '', regex=True)  # strip $ , . characters
cs["Price"] = cs["Price"].str[:-2]

# adding a date column to our dataframe
cs["S_date"] = pd.date_range("1/1/2020", periods=len(cs))

cs["Total_S"] = cs["Price"].astype(int).cumsum()

# now plot S_date against Total_S
cs.plot(x="S_date", y="Total_S");

# Plot a bar graph
x = np.random.rand(10, 4)
# Now make a data frame of this random data.
df = pd.DataFrame(x, columns=['a', 'b', 'c', 'd'])

df.plot.bar()

cs.plot(x="Make", y="Meter", kind="bar")

Histogram

cs["Price"].plot.hist()

cs["Price"] = cs["Price"].astype(int)
cs.plot(x="Odm", y="Pr", kind="scatter");

#Using subplots to plot every column in its own panel of one figure.

cs.head()
cs.plot.hist(figsize=(10, 30), subplots=True);


#Object-Oriented Method

fig, ax = plt.subplots(figsize=(10, 6))
df.plot(kind='scatter', x='age', y='high', c='tgt', ax=ax);
ax.set_xlim([50, 100])

## OO Method from the base

fig, ax = plt.subplots(figsize=(10, 6))

#Plot the data
scatter = ax.scatter(x=df["age"],
                     y=df["ch"],
                     c=df["trgt"])

#Customize the plot
ax.set(title="HDD and LL",
       xlabel="Age",
       ylabel="ch")

# Add a legend
ax.legend(*scatter.legend_elements(), title="trgt");

#Add a horizontal line
ax.axhline(df["ch"].mean(), linestyle='--')

# subplot of age, height, weight
fig, (ax0, ax1) = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
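
The two axes created above can each be drawn on separately. A self-contained sketch with made-up stand-in data (the column names age and ch mirror the example above):

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

df = pd.DataFrame({"age": np.random.randint(20, 80, 100),
                   "ch": np.random.randint(100, 400, 100)})  # stand-in data
fig, (ax0, ax1) = plt.subplots(nrows=2, ncols=1, figsize=(10, 10))
ax0.hist(df["age"])
ax0.set(title="Age distribution", ylabel="count")
ax1.hist(df["ch"])
ax1.set(title="ch distribution", ylabel="count")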



Scikit-learn (also written sklearn) is a machine learning library for Python

Work Flow of Machine Learning Models

Getting the data ready (to be used with ML models)

Choosing a machine learning model

Fitting the model to the data

Making predictions with the model

Evaluating model predictions

Improving model predictions

Saving and loading models

An example of this workflow:

# Standard imports
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

#1. Get the data ready
###################################################################
# Import dataset
bad_disease = pd.read_csv("../data/bad-disease.csv")
# View the data
bad_disease.head()
####################################################################
# Create X (all the feature columns)
X = bad_disease.drop("target", axis=1)
# Create y (the target column)
y = bad_disease["target"]
####################################################################
# Split the data into training and test sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y)
# View the data shapes
X_train.shape, X_test.shape, y_train.shape, y_test.shape

#2. Choose the model/estimator
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()

#3. Fit the model to the data and use it to make a prediction
#A model will (attempt to) learn the patterns in a dataset by calling the fit() function on it and passing it the data.
model.fit(X_train, y_train)

#Once a model has learned patterns in data, you can use them to make a prediction with the predict() function.
# Make predictions
y_preds = model.predict(X_test)

# This will be in the same format as y_test
y_preds

X_test.loc[209]
bad_disease.loc[209]

# Make a prediction on a single sample (has to be array)
model.predict(np.array(X_test.loc[209]).reshape(1, -1))
#array([0])


#4. Evaluate the model
#A trained model/estimator can be evaluated by calling the score() function and passing it a collection of data.

# On the training set
model.score(X_train, y_train)

# On the test set (unseen)
model.score(X_test, y_test)

#5. Experiment to improve (hyperparameter tuning)
#A model's first evaluation metrics aren't always its last. One way to improve a model's predictions is with hyperparameter tuning.

# Try different numbers of estimators (n_estimators is a hyperparameter you can change)
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    model = RandomForestClassifier(n_estimators=i).fit(X_train, y_train)
    print(f"Model accruacy on test set: {model.score(X_test, y_test)}")
    print("")

#6. Save a model for later use
#A trained model can be exported and saved so it can be imported and used later. One way to save a model is using Python's pickle module.

import pickle
# Save trained model to file
pickle.dump(model, open("random_forest_model_1.pkl", "wb"))

# Load a saved model and make a prediction on a single example
loaded_model = pickle.load(open("random_forest_model_1.pkl", "rb"))
loaded_model.predict(np.array(X_test.loc[209]).reshape(1, -1))
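
As an alternative to pickle, scikit-learn models are often saved with joblib, which handles the large NumPy arrays inside a model more efficiently. A short sketch using the same model and sample as above:

import joblib

# Save the trained model to file
joblib.dump(model, "random_forest_model_1.joblib")

# Load it back and predict on the same single sample
loaded_model = joblib.load("random_forest_model_1.joblib")
loaded_model.predict(np.array(X_test.loc[209]).reshape(1, -1))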

Another Way of the Example


--We fit the model on the training data and make predictions on the testing data
--A classification model (random forest) finds the patterns in the training data
--Then we make predictions on the test data

import pandas as pd
hd = pd.read_csv("data/baddisease.csv")
# create X, the feature columns
X = hd.drop("target", axis=1)
# create y, the label
y = hd["target"]
# 2. Choose the right model and hyperparameters
# RandomForestClassifier is a classification machine learning model
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(n_estimators=100)
# we will keep the default hyperparameters
clf.get_params()
# 3. Fit the model to the training data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
clf.fit(X_train, y_train)  # fitting the model to the training data
y_preds = clf.predict(X_test)  # making predictions on the test data
y_preds
# 4. Evaluate the model on the training data and testing data.
clf.score(X_train, y_train)
clf.score(X_test, y_test)