Web scraping using a Python script
In [3]: from bs4 import BeautifulSoup
import requests
In [12]: url = "https://en.wikipedia.org/wiki/List_of_largest_companies_by_revenue"  # page downloaded below; its first <table> is the one that gets scraped
In [211…
# Download the page. The timeout keeps the cell from hanging forever on a
# stalled connection, and raise_for_status() turns an HTTP error status
# (e.g. 403 or 404) into an exception here instead of letting an error page
# be parsed further down.
pages = requests.get(url, timeout=30)
pages.raise_for_status()
print(pages)
In [96]: soup = BeautifulSoup(pages.text, 'html.parser')
# 'html.parser' (standard library) is named explicitly: a bare 'html' lets
# Beautiful Soup choose among whichever parsers happen to be installed, so the
# parse tree could differ between environments.
In [210…
# Show only the beginning of the formatted HTML: the whole page is far too
# long to be useful as cell output.
PREVIEW_CHARS = 2000
print(soup.prettify()[:PREVIEW_CHARS])
Extracting column details
In [212…
# Work with the first <table> element found in the parsed page.
page_tables = soup.find_all('table')
our_table = page_tables[0]
print(our_table)
In [213…
# The first <tr> of the table is the row the column headers are read from.
col_names = our_table.find(name='tr')
print(col_names)
In [214…
# Collect every <th> cell of that first row.
col_name = col_names.find_all(name='th')
print(col_name)
In [136…
# Turn each header cell into its whitespace-trimmed text.
column_names = []
for header_cell in col_name:
    column_names.append(header_cell.text.strip())
print(column_names)
['Rank', 'Name', 'Industry', 'Revenue', 'Profit', 'Employees', 'Headquarters[note 1]', 'State-owned', 'Ref.', 'Revenue per worker']
In [ ]:
In [ ]:
Setting up our DataFrame
In [121…
import pandas as pd
In [203…
# Start from an empty frame that carries only the scraped header names.
df = pd.DataFrame(data=None, columns=column_names)
df
Rank
Out[203]:
Name
Industry
Revenue
Profit
Employees
Headquarters[note 1]
State-owned
Ref.
Revenue per worker
Scraping the data to fill our DataFrame columns
In [204…
# Remove 'Rank' for now: the row-filling loop only reads <td> cells.
# Rebinding (instead of inplace=True) together with errors='ignore' means
# re-running this cell no longer raises KeyError once the column is gone.
df = df.drop(columns='Rank', errors='ignore')
df
Name
Out[204]:
Industry
Revenue
Profit
Employees
# Rank is added back later: its values sit in <th> cells, while the other columns are read from <td> cells.
Headquarters[note 1]
In [215…
# Keep every <tr> after the first two. Row 0 is the header row read above;
# row 1 is skipped as well (presumably a second header row - confirm against
# the page).
table_rows = our_table.find_all('tr')
our_data = table_rows[2:]
print(our_data)
In [205…
# Collect the text of every <td> cell, one list per table row, then build the
# frame in a single step. Appending with df.loc[len(df)] inside the loop
# re-allocated the frame on every row and duplicated all rows whenever the
# cell was run a second time.
expected_width = len(df.columns)
scraped_rows = []
for row_number, trs in enumerate(our_data):
    raw_data = trs.find_all('td')
    data_names = [tds.text.strip() for tds in raw_data]
    # Keep the strictness of the row-by-row assignment: a row whose cell
    # count does not match the columns is an error, not silently padded.
    if len(data_names) != expected_width:
        raise ValueError(
            f"row {row_number} has {len(data_names)} cells, "
            f"expected {expected_width}"
        )
    scraped_rows.append(data_names)
df = pd.DataFrame(scraped_rows, columns=df.columns)
In [206…
# (rows, columns) of the frame after filling it; 'Rank' is not included yet.
df.shape
Out[206]:
State-owned
Ref.
Revenue per worker
(50, 9)
In [190…
# Getting the Ranks
In [216…
# Take the first <th> of each data row, so the i-th rank always belongs to the
# i-th scraped row. Skipping a fixed 11 <th> cells across the whole table
# only worked for one particular header layout.
rank_data = [row.find('th') for row in our_data]
print(rank_data)
In [197…
# Reduce each rank cell to its whitespace-trimmed text.
data_list = []
for rank_cell in rank_data:
    data_list.append(rank_cell.text.strip())
print(data_list)
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', '20', '21', '22', '23', '24', '25', '26', '27', '28', '29', '30', '31', '3
2', '33', '34', '35', '36', '37', '38', '39', '40', '41', '42', '43', '44', '45', '46', '47', '48', '49', '50']
In [217…
# Put 'Rank' back as the first column. Any existing 'Rank' column is dropped
# first so that re-running the cell refreshes it instead of raising
# ValueError ("cannot insert Rank, already exists").
df = df.drop(columns='Rank', errors='ignore')
df.insert(0, 'Rank', data_list)
df.head()
Rank
Name
Industry
Revenue
Profit
Employees
Headquarters[note 1]
0
1
Walmart
Retail
$611,289
$11,680
2,100,000
1
2
Saudi Aramco
Oil and gas $603,651
$159,069
2
3
Amazon
Retail
$574,785
3
4
State Grid Corporation of China
Electricity
4
5
Vitol
Commodities
Out[217]:
Ref.
Revenue per worker
United States
[1]
$291,090.00
70,496
Saudi Arabia
[4]
$8,562,911.37
$30,425
1,525,000
United States
[5]
$376,908
$530,009
$8,192
870,287
China
[6]
$609,004.85
$505,000
$15,000
1,560
Switzerland
[7][8]
$323,717,948.72
Exporting our scraped data as a .csv file
In [209…
In [ ]:
# Write the finished table to disk, leaving out the integer row index.
df.to_csv(path_or_buf='scrapped DataFrame.csv', index=False)
State-owned