
TripAdvisor_Hotel_Analysis.github.io

U.S. Hotels Analysis (data scraped from TripAdvisor)

# import libraries
import requests
import time
from bs4 import BeautifulSoup
import pandas as pd
from urllib.parse import urljoin
import re
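
Since the full scrape issues thousands of HTTP requests, a shared session with a simple retry wrapper and a courtesy delay can make it more polite and resilient. This is a minimal sketch, not part of the original run; the User-Agent string, timeout, and delay values are my own assumptions:

session = requests.Session()
session.headers.update({"User-Agent": "Mozilla/5.0 (student hotel-analysis project)"}) # assumed header

def fetch(url, retries=3, delay=2):
    # GET a page, retrying on network errors with a short pause between attempts
    for attempt in range(retries):
        try:
            resp = session.get(url, timeout=10)
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            time.sleep(delay)
    return None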

Save the base URLs of TripAdvisor and create lists to store the scraped links

urlmain = "https://www.tripadvisor.com/Hotels-g191-United_States-Hotels.html" # main page for U.S. hotels
base_url = "https://www.tripadvisor.com" # base URL of the TripAdvisor website

list_local = []      # URLs of the first results page for each of the top 100 U.S. cities
full_list_local = [] # URLs of the first 5 results pages per city (500 pages in total)
list_hotel = []      # URLs of the hotels listed on those 500 pages

Retrieve the hotel links

# retrieve the links on the first page and store them in list_local
html = requests.get(urlmain)
soup = BeautifulSoup(html.content, 'lxml')
for item in soup.find_all("a", attrs={"class": "linkText"}):
    loc_url = item.get("href")                # each href holds a relative link to a city page
    full_loc_url = urljoin(base_url, loc_url) # join the href with the TripAdvisor base URL
    list_local.append(full_loc_url)           # store the absolute link
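
urljoin takes care of combining the site root with the relative hrefs; a quick illustration with a hypothetical href of the shape TripAdvisor uses:

urljoin("https://www.tripadvisor.com", "/Hotels-g60763-New_York_City_New_York-Hotels.html")
# -> 'https://www.tripadvisor.com/Hotels-g60763-New_York_City_New_York-Hotels.html'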

# retrieve the links from the next 4 pages
urlhead = "https://www.tripadvisor.com/Hotels-g191-oa" # the URL splits into two fixed parts;
urltail = "-United_States-Hotels.html#LEAF_GEO_LIST"   # the offset between them selects the page
for i in range(20, 100, 20): # i takes the values 20, 40, 60, 80, corresponding to pages 2-5
    url = urlhead + str(i) + urltail # the URL of the page to scrape
    html = requests.get(url)
    soup = BeautifulSoup(html.content, 'lxml')
    for item in soup.find_all("a", attrs={"class": "city"}):
        loc_url = item.get("href")
        full_loc_url = urljoin(base_url, loc_url)
        list_local.append(full_loc_url) # store the links in list_local

len(list_local) # check that I got all 100 links
#list_local[:10] 
100
split_char = "-" # split the links at the "-" character;
                 # the page-number token is inserted after the 2nd "-"

for url in list_local:
    temp = url.split(split_char)
    for i in range(0, 150, 30): # offsets 0, 30, 60, 90, 120 correspond to pages 1-5
        # rejoin the link with the page-offset token inserted in the middle
        page_url = split_char.join(temp[:2]) + "-oa" + str(i) + "-" + split_char.join(temp[2:])
        full_list_local.append(page_url)

len(full_list_local) # check that I got all 500 page URLs
#full_list_local[:10]
500
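
As a sanity check, this is what the rebuilt URLs look like for one hypothetical city link (the geo code and city name are illustrative):

url = "https://www.tripadvisor.com/Hotels-g60763-New_York_City_New_York-Hotels.html"
temp = url.split("-")
print("-".join(temp[:2]) + "-oa30" + "-" + "-".join(temp[2:]))
# https://www.tripadvisor.com/Hotels-g60763-oa30-New_York_City_New_York-Hotels.html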
for url in full_list_local:
    html = requests.get(url)
    soup = BeautifulSoup(html.content, 'lxml')
    div = soup.find_all("div", attrs={"class": "listing_title"}) # the divs that contain the hotel hrefs

    for item in div:
        for i in item.find_all("a"):
            href_url = i.get("href")                # get the relative href
            hotel_url = urljoin(base_url, href_url) # join it with the base URL
            list_hotel.append(hotel_url)            # save the absolute hotel link
hotel_html = pd.Series(list_hotel).drop_duplicates().tolist() # drop duplicates and store the result in hotel_html
len(hotel_html) # check how many hotels I will have for my dataset
12313
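
pd.Series(...).drop_duplicates() keeps the first occurrence of each link. The same order-preserving deduplication also works in plain Python (assuming Python 3.7+, where dicts preserve insertion order):

hotel_html = list(dict.fromkeys(list_hotel)) # order-preserving dedup without pandas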

Scrape the information from hotel pages

There are 9 attributes I want to scrape from each hotel page: name, location, price, score, rating label, walk grade, number of nearby restaurants, number of nearby attractions, and the state code.
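
The scraping loop below repeats the same find/try/except pattern nine times; it could be condensed with a small helper like the following sketch (safe_text is my own hypothetical helper, not part of the original code):

def safe_text(tag):
    # return the text of a BeautifulSoup tag, or "NA" if find() returned None
    return tag.text if tag is not None else "NA"

# e.g.: name.append(safe_text(soup.find("h1", attrs={"class": "hotels-hotel-review-atf-info-parts-Heading__heading--2ZOcD"})))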

# create 9 empty lists to store the data
# some values are missing, so I use try/except; missing values are recorded as "NA"
name = []
location = []
price = []
score = []
walk = []
restaurant = []
attraction = []
label = []
state = []

for url in hotel_html:
    link = requests.get(url)
    soup = BeautifulSoup(link.content, 'lxml')

    # scrape the name
    try:
        nm = soup.find("h1", attrs={"class": "hotels-hotel-review-atf-info-parts-Heading__heading--2ZOcD"})
        name.append(nm.text)
    except AttributeError:
        name.append("NA")

    # scrape the location
    try:
        loc = soup.find("a", attrs={"data-tracking-label": "tourism"})
        location.append(loc.text)
    except AttributeError:
        location.append("NA")

    # scrape the price (two page layouts exist, so try a second selector if the first fails)
    try:
        pc = soup.find("div", attrs={"data-sizegroup": "hr_chevron_prices"})
        price.append(pc.text)
    except AttributeError:
        try:
            pc = soup.find("div", attrs={"class": "hotels-hotel-offers-DominantOffer__price--D-ycN"})
            price.append(pc.text)
        except AttributeError:
            price.append("NA")

    # scrape the score
    try:
        scr = soup.find("span", attrs={"class": "hotels-hotel-review-about-with-photos-Reviews__overallRating--vElGA"})
        score.append(scr.text)
    except AttributeError:
        score.append("NA")

    # scrape the rating label
    try:
        lbl = soup.find("div", attrs={"class": "hotels-hotel-review-about-with-photos-Reviews__ratingLabel--24XY2"})
        label.append(lbl.text)
    except AttributeError:
        label.append("NA")

    # scrape the walk grade
    try:
        wlk = soup.find("span", attrs={"class": "hotels-hotel-review-location-layout-Highlight__number--S3wsZ hotels-hotel-review-location-layout-Highlight__green--3lccI"})
        walk.append(wlk.text)
    except AttributeError:
        walk.append("NA")

    # scrape the number of restaurants
    try:
        resr = soup.find("span", attrs={"class": "hotels-hotel-review-location-layout-Highlight__number--S3wsZ hotels-hotel-review-location-layout-Highlight__orange--1N-BP"})
        restaurant.append(resr.text)
    except AttributeError:
        restaurant.append("NA")

    # scrape the number of attractions
    try:
        attr = soup.find("span", attrs={"class": "hotels-hotel-review-location-layout-Highlight__number--S3wsZ hotels-hotel-review-location-layout-Highlight__blue--2qc3K"})
        attraction.append(attr.text)
    except AttributeError:
        attraction.append("NA")

    # scrape the state code
    try:
        st = []
        for item in soup.find_all("li", attrs={"class": "breadcrumb"}): # the breadcrumbs hold country, state, city, ...
            st.append(item.text)
        st1 = st[1]                                 # the 2nd item contains the state name and its code
        st2 = re.search(r'\(([^)]+)', st1).group(1) # capture the text inside the first parenthesis
        state.append(st2)
    except (AttributeError, IndexError):
        state.append("NA")

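The state-code extraction relies on the breadcrumb text carrying the code in parentheses; a worked example with a hypothetical breadcrumb string:

st1 = "New York (NY)"                       # hypothetical breadcrumb entry
st2 = re.search(r'\(([^)]+)', st1).group(1) # everything inside the first parenthesis
print(st2)                                  # NY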
Save the scraped data in a CSV file

# assemble the nine lists into a single dataframe
df = pd.DataFrame({
    "Hotel": name,
    "Location": location,
    "Code": state,
    "Cost": price,
    "Score": score,
    "Rating": label,
    "Walk.Grade": walk,
    "No. Restaurants": restaurant,
    "No. Attractions": attraction,
})
df.drop_duplicates(inplace=True) # drop duplicates; fortunately, there are none after the scraping


df # show my dataframe
Hotel Location Code Cost Score Rating Walk.Grade No. Restaurants No. Attractions
0 Baccarat Hotel & Residences New York New York City NY $1,045 4.5 Excellent 100 451 119
1 Crowne Plaza Times Square Manhattan New York City NY $229 4.0 Very good 100 551 246
2 Park Lane Hotel New York City NY $180 4.0 Very good 100 263 90
3 Martinique New York on Broadway, Curio Collect... New York City NY $191 4.0 Very good 100 547 104
4 Arlo NoMad NA NY $215 4.5 Excellent 100 511 89
... ... ... ... ... ... ... ... ... ...
12308 Beach Walk Oceanfront Inn Old Orchard Beach ME NA 3.5 Very good 71 50 9
12309 Moby Dick Motel Old Orchard Beach ME NA 3.5 Very good 87 57 12
12310 Sir Charles Motel Old Orchard Beach ME NA 3.5 Very good 83 56 11
12311 Sunset Motel Old Orchard Beach ME NA 3.0 Average 100 47 11
12312 Seafarer Inn and Cottages Old Orchard Beach ME $125 2.0 Poor 62 12 1

12313 rows × 9 columns
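
Before any numeric analysis, the Cost column (strings such as "$1,045" or "NA") would need converting to numbers. A minimal sketch, storing the result in a new, hypothetical Cost.Num column:

# strip "$" and thousands separators; errors="coerce" turns the "NA" strings into NaN
df["Cost.Num"] = pd.to_numeric(df["Cost"].str.replace(r"[$,]", "", regex=True), errors="coerce")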

df.to_csv("TripAd-U.S_Hotels.csv", sep='\t') # save it to a file (tab-separated, despite the .csv extension)
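
To load the file back later, the same tab separator must be passed explicitly (index_col=0 restores the saved row index):

df = pd.read_csv("TripAd-U.S_Hotels.csv", sep='\t', index_col=0) # read the tab-separated file back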