# import libraries
import requests
import urllib.request
import time
from bs4 import BeautifulSoup
import pandas as pd
import numpy as np
import urllib.parse
from urllib.parse import urljoin
import re

Save the base links of TripAdvisor and create lists to store links later

# Root of the TripAdvisor site; site-relative hrefs are resolved against it.
base_url = "https://www.tripadvisor.com"
# Landing page listing hotel destinations in the United States.
urlmain = "https://www.tripadvisor.com/Hotels-g191-United_States-Hotels.html"

# Accumulators that the scraping steps fill in.
list_local = []       # first-page URL for each of the top 100 U.S. locations
full_list_local = []  # first 5 result pages per location (500 URLs in total)
list_hotel = []       # hotel URLs found on those 500 pages

Retrieve the hotel links

Step 1: Scrape 100 location links

I will scrape the links of 100 locations (5 pages, 20 locations/page).
Because the first page has a different structure, it needs its own code.
For the other pages, I will use a for loop to retrieve the links.
The 100 location links will be stored in list_local.

# Retrieve the location links on the first page and store them in list_local.
# The timeout keeps the script from waiting forever on a stalled connection.
html = requests.get(urlmain, timeout=30)
soup = BeautifulSoup(html.content, 'lxml')
# href=True skips anchors that carry no href; urljoin would otherwise turn a
# missing href into a bare base_url entry in list_local.
for item in soup.find_all("a", attrs={"class": "linkText"}, href=True):
    # the href is joined with the base link of TripAdvisor to get a full URL
    list_local.append(urljoin(base_url, item["href"]))

# Retrieve the location links on the next 4 pages and add them to list_local.
# The page URL is split into two parts; the number placed between them
# determines which page is requested.
urlhead = "https://www.tripadvisor.com/Hotels-g191-oa"
urltail = "-United_States-Hotels.html#LEAF_GEO_LIST"
for i in range(20, 100, 20):  # i takes 20, 40, 60, 80 -> pages 2, 3, 4, 5
    url = urlhead + str(i) + urltail  # the url of the page being scraped
    # timeout: do not hang forever on a stalled connection
    html = requests.get(url, timeout=30)
    soup = BeautifulSoup(html.content, 'lxml')
    # href=True skips anchors without an href, which urljoin would otherwise
    # turn into a bare base_url entry
    for item in soup.find_all("a", attrs={"class": "city"}, href=True):
        list_local.append(urljoin(base_url, item["href"]))

len(list_local) # check if I get all the 100 links
#list_local[:10]

Step 2: Retrieve the links for the first 5 pages of each location

I now have the 100 location links, but each link points to the first page of results only.
For each location, I want the hotel links from the first 5 pages, so there will be 500 pages in total.
The code below builds the 500 page URLs and stores them in full_list_local.

split_char = "-"  # location URLs are cut at "-" so a page token can be inserted

for url in list_local:
    # Only the first two "-" matter: the page token goes right after the
    # second piece, and everything after it is kept untouched.
    pieces = url.split(split_char, 2)
    prefix = split_char.join(pieces[:2])
    suffix = split_char.join(pieces[2:])
    # offsets 0, 30, 60, 90, 120 -> five page URLs per location
    full_list_local.extend(
        prefix + "-oa" + str(offset) + "-" + suffix
        for offset in range(0, 150, 30)
    )

len(full_list_local) # check if I get all 500 pages
#full_list_local[:10]

Step 3: Scrape the hotel links

I will write code to scrape all the hotel links from the 500 pages (~30 hotels per page).
Theoretically, I would have around 15,000 hotels, but in practice many locations have under 100 hotels.
The real number will be smaller than 15,000. I expect to have 10,000-13,000 hotels for my dataset.

# Visit every location result page and collect the hotel links it lists.
for url in full_list_local:
    # timeout: do not hang forever on a stalled connection
    html = requests.get(url, timeout=30)
    soup = BeautifulSoup(html.content, 'lxml')
    # the div(s) with class "listing_title" contain the anchors for the hotels
    for title_div in soup.find_all("div", attrs={"class": "listing_title"}):
        # href=True skips anchors without an href, which urljoin would
        # otherwise turn into a bare base_url entry
        for anchor in title_div.find_all("a", href=True):
            # join the href with the base link and save the hotel link
            list_hotel.append(urljoin(base_url, anchor["href"]))

# Drop duplicate links while keeping first-seen order; dict keys are unique
# and preserve insertion order.
hotel_html = list(dict.fromkeys(list_hotel))

len(hotel_html) # check how many hotels I will have for my dataset

Scrape the information from hotel pages

There are 9 attributes I want to scrape in each hotel page:

The hotel name.
The location/city of the hotel.
Cost to stay at the hotel per night (excluding taxes and other fees).
TripAdvisor Score for the hotel.
Ratings for the hotel (excellent, good, bad,…).
Walk grade (0-100): shows how convenient travelers feel when moving to places near the hotel.
Number of nearby restaurants.
Number of nearby attractions.
State code of the hotel.

# Scrape the 9 attributes of every hotel page into 9 parallel lists
# (exactly one entry is appended to each list per hotel URL).
# Some values are missing on some pages; missing values are recorded as "NA".
name = []
location = []
price = []
score = []
walk = []
restaurant = []
attraction = []
label = []
state = []

# Captures the text between the first "(" and the following ")".
# Raw string: a plain '\(' is an invalid escape sequence in a normal literal.
_STATE_CODE_RE = re.compile(r'\(([^)]+)')


def _first_text(soup, *selectors):
    """Return the text of the first matching element, or "NA" if none match.

    Each selector is a ``(tag_name, attrs)`` pair passed to ``soup.find``;
    selectors are tried in the order given.
    """
    for tag_name, attrs in selectors:
        found = soup.find(tag_name, attrs=attrs)
        if found is not None:
            return found.text
    return "NA"


def _state_code(soup):
    """Return the text in the first parentheses of the 2nd breadcrumb, or "NA".

    The breadcrumb items hold the country, state, location, ...; the 2nd item
    is expected to contain the state name followed by its code in parentheses.
    """
    crumbs = [item.text for item in soup.find_all("li", attrs={"class": "breadcrumb"})]
    if len(crumbs) < 2:
        return "NA"
    match = _STATE_CODE_RE.search(crumbs[1])
    return match.group(1) if match else "NA"


for url in hotel_html:
    # timeout: do not hang forever on a stalled connection
    link = requests.get(url, timeout=30)
    soup = BeautifulSoup(link.content, 'lxml')

    # scrape the name
    name.append(_first_text(
        soup,
        ("h1", {"class": "hotels-hotel-review-atf-info-parts-Heading__heading--2ZOcD"}),
    ))

    # scrape the location
    location.append(_first_text(
        soup,
        ("a", {"data-tracking-label": "tourism"}),
    ))

    # scrape the price: try the first price element, then fall back to the
    # dominant-offer price element
    price.append(_first_text(
        soup,
        ("div", {"data-sizegroup": "hr_chevron_prices"}),
        ("div", {"class": "hotels-hotel-offers-DominantOffer__price--D-ycN"}),
    ))

    # scrape the score
    score.append(_first_text(
        soup,
        ("span", {"class": "hotels-hotel-review-about-with-photos-Reviews__overallRating--vElGA"}),
    ))

    # scrape the ratings
    label.append(_first_text(
        soup,
        ("div", {"class": "hotels-hotel-review-about-with-photos-Reviews__ratingLabel--24XY2"}),
    ))

    # scrape the walk grade
    walk.append(_first_text(
        soup,
        ("span", {"class": "hotels-hotel-review-location-layout-Highlight__number--S3wsZ hotels-hotel-review-location-layout-Highlight__green--3lccI"}),
    ))

    # scrape the no. of restaurants
    restaurant.append(_first_text(
        soup,
        ("span", {"class": "hotels-hotel-review-location-layout-Highlight__number--S3wsZ hotels-hotel-review-location-layout-Highlight__orange--1N-BP"}),
    ))

    # scrape the no. of attractions
    attraction.append(_first_text(
        soup,
        ("span", {"class": "hotels-hotel-review-location-layout-Highlight__number--S3wsZ hotels-hotel-review-location-layout-Highlight__blue--2qc3K"}),
    ))

    # scrape the state code
    state.append(_state_code(soup))
   
    

Save the scraped data in a CSV file

# Merge the nine parallel lists into one dataframe, one column per attribute.
# Building the frame from named columns also works when the lists are empty
# (assigning 9 column names to a frame with no columns would fail).
# Wrapping each list in an object Series pads shorter lists with NaN instead
# of raising if the lists ever differ in length.
column_data = {
    "Hotel": name,
    "Location": location,
    "Code": state,
    "Cost": price,
    "Score": score,
    "Rating": label,
    "Walk.Grade": walk,
    "No. Restaurants": restaurant,
    "No. Attractions": attraction,
}
df = pd.DataFrame(
    {column: pd.Series(values, dtype=object) for column, values in column_data.items()}
)
df.drop_duplicates(inplace=True) # drop duplicates, fortunately, I have no duplicates after the scraping

df # show my dataframe

	Hotel	Location	Code	Cost	Score	Rating	Walk.Grade	No. Restaurants	No. Attractions
0	Baccarat Hotel & Residences New York	New York City	NY	$1,045	4.5	Excellent	100	451	119
1	Crowne Plaza Times Square Manhattan	New York City	NY	$229	4.0	Very good	100	551	246
2	Park Lane Hotel	New York City	NY	$180	4.0	Very good	100	263	90
3	Martinique New York on Broadway, Curio Collect...	New York City	NY	$191	4.0	Very good	100	547	104
4	Arlo NoMad	NA	NY	$215	4.5	Excellent	100	511	89
...	...	...	...	...	...	...	...	...	...
12308	Beach Walk Oceanfront Inn	Old Orchard Beach	ME	NA	3.5	Very good	71	50	9
12309	Moby Dick Motel	Old Orchard Beach	ME	NA	3.5	Very good	87	57	12
12310	Sir Charles Motel	Old Orchard Beach	ME	NA	3.5	Very good	83	56	11
12311	Sunset Motel	Old Orchard Beach	ME	NA	3.0	Average	100	47	11
12312	Seafarer Inn and Cottages	Old Orchard Beach	ME	$125	2.0	Poor	62	12	1

12313 rows × 9 columns

# Save the dataframe to disk. The separator is a tab, so the file is
# tab-delimited even though its name ends in .csv.
output_file = "TripAd-U.S_Hotels.csv"
df.to_csv(output_file, sep="\t")

TripAdvisor_Hotel_Analysis.github.io

U.S Hotels Analysis (data scraped from TripAdvisor)

Save the base links of TripAdvisor and create lists to store links later

Retrieve the hotel links

Step 1: Scrape 100 location links

Step 2: Retrieve the links for the first 5 pages of each location

Step 3: Scrape the hotel links

Scrape the information from hotel pages

Save the scraped data in a CSV file