Beautiful Soup implementation
import pandas as pd
import requests
from bs4 import BeautifulSoup
import random
import mysql.connector
# Scrape a paginated HTML listing and store selected table rows in MySQL.
#
# For each page the third <table> inside <body> is read row by row and the
# first eight <td> values of every kept row are inserted into `uppnrs`.
# NOTE(review): the URL and the connection settings below look like
# placeholders -- replace them (ideally from configuration) before running.

PAGE_URL_TEMPLATE = "http://somewebsite?page={}"
REQUEST_TIMEOUT_SECONDS = 30  # without a timeout requests.get() can block forever

# One of these is picked at random for every request.
USER_AGENTS = [
    "Mozilla/5.0 (Windows NT 10.0; rv:91.0) Gecko/20100101 Firefox/91.0",
    "Mozilla/5.0 (Windows NT 10.0; rv:78.0) Gecko/20100101 Firefox/78.0",
    "Mozilla/5.0 (X11; Linux x86_64; rv:95.0) Gecko/20100101 Firefox/95.0"
]

TABLE_INDEX = 2        # index of the table of interest among <body>'s tables
LEADING_ROW_COUNT = 5  # rows whose text matches one of the first five rows are skipped
LAST_ROW_INDEX = 29    # scanning stops after the row whose text matches this row
COLUMN_COUNT = 8       # number of <td> cells stored per row

INSERT_SQL = "INSERT INTO uppnrs (pnumber,name,dob,doj,prank,punit,sunit,hdistrict) VALUES (%s, %s,%s, %s,%s, %s,%s,%s)"

mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
    database="dbname"
)
# A single cursor is reused for every page and released in the `finally`
# below, together with the connection, even if a page fails.
mycursor = mydb.cursor()
try:
    my_list = list(range(1, 4))  # pages 1, 2 and 3
    for iurl in my_list:
        url = PAGE_URL_TEMPLATE.format(iurl)
        headers = {
            'User-Agent': random.choice(USER_AGENTS)
        }
        response = requests.get(url, headers=headers, timeout=REQUEST_TIMEOUT_SECONDS)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page.
        response.raise_for_status()
        soup = BeautifulSoup(response.text, 'lxml')

        data = soup.body.find_all('table')[TABLE_INDEX].find_all('tr')

        # Computed once per page rather than once per row.
        skip_texts = {row.text.strip() for row in data[:LEADING_ROW_COUNT]}
        # Pages with fewer rows than LAST_ROW_INDEX + 1 have no stop row and
        # are simply read to the end (previously this raised IndexError).
        stop_text = (
            data[LAST_ROW_INDEX].text.strip()
            if len(data) > LAST_ROW_INDEX
            else None
        )

        for row in data:
            row_text = row.text.strip()
            if row_text in skip_texts:
                continue

            cells = row.find_all('td')  # looked up once per row
            # Indexing (not slicing) keeps the IndexError for rows that have
            # fewer than COLUMN_COUNT cells, so malformed rows are not
            # silently stored with missing values.
            val = tuple(cells[k].text.strip() for k in range(COLUMN_COUNT))
            mycursor.execute(INSERT_SQL, val)

            if row_text == stop_text:
                break

        # One commit per page: cheaper than one per row, and a page that
        # fails midway leaves no partial rows behind.
        mydb.commit()
        print(url+' done')
finally:
    mycursor.close()
    mydb.close()
print('done')
*********************************************
Export from database to Excel
import mysql.connector
import pandas as pd
import numpy as np
# Dump every row of a MySQL table into an Excel workbook (data.xlsx).
# NOTE(review): the connection settings and the table name look like
# placeholders -- replace them before running.
mydb = mysql.connector.connect(
    host="localhost",
    user="root",
    password="password",
    database="dbname"
)
# The result set is fetched fully into memory, so the cursor and connection
# can be released before the workbook is written; `finally` guarantees the
# release even if the query fails.
try:
    mycursor = mydb.cursor()
    try:
        mycursor.execute("SELECT * FROM tablename")
        myresult = mycursor.fetchall()
    finally:
        mycursor.close()
finally:
    mydb.close()

# No column names are passed, so pandas labels the columns 0..n-1.
# NOTE(review): if named headers are wanted, the cursor's column names would
# have to be captured before it is closed -- confirm the desired output.
df = pd.DataFrame(myresult)
df.to_excel('data.xlsx')
Comments
Post a Comment