## Version 3.3 with Fewer Functions
* Added function `getTable`, which takes a modelSoup entry, trims it down to just the specs table, and then makes a new list (table) with each specification panel (excluding those for entertainment, exterior, interior, mechanical, and safety, which get parsed differently)
* Eliminated use of other functions for isolating specs table categories and scraping data.  Instead, all 'tr' entries from the table (excluding those from the last five spec categories) are scraped
* Changed the scraping methods for entertainment, exterior, interior, mechanical, and safety to look for the section heading from the table rather than just relying on the panel number
* Added a 'try' condition in the main stats scraping block.  If a vehicle does not have a specs table, an error will be thrown in the `getTable` function and the loop will go to the next vehicle.  Eliminates any vehicles without stats from the final table.

Please Note: this version does not have the functionality to replace missing values with "N/A" as in the other 3.3 version.  I believe the compilation of all features will be more useful, however, because it will allow us to bypass our personal bias of trying to decide what features to use.  We will instead be able to see the frequency of data in each category and eliminate those that are less frequently used rather than those that "we don't think are important".

**I tested with trims from the cadillac escalade and the land rover defender, change indexing if you want to run more**


In [4]:
import pandas as pd
import numpy as np
import imageio
import matplotlib.pyplot as plt
import bs4
import requests
import time

#!pip install fuzzywuzzy
import fuzzywuzzy
from fuzzywuzzy import fuzz
from fuzzywuzzy import process

https://www.cars.com/research/search/?page=1&page_size=40&years%5B%5D=2021
Hard-code the number of pages, then iterate through the cars on each page to get their links (`/trims` has to be appended to each link).

In [5]:
# Collect the research-page link of every model listed in the paginated
# cars.com search results for model year 2021.
allModelTypes = []

# NOTE: the page count (1..11) is hard-coded; adjust the range if the number
# of result pages changes.
for pageNum in range(1,12):
    pageURL = "https://www.cars.com/research/search/?page=" + str(pageNum) + "&page_size=40&years[]=2021"
    pageResponse = requests.get(pageURL)
    time.sleep(1)  # pause between consecutive requests
    pageSoup = bs4.BeautifulSoup(pageResponse.text, 'html.parser')

    for link in pageSoup.find_all('a', class_='sds-button--fluid'):
        href = link.get("href")
        # An anchor without an href yields None, which would make the
        # substring test raise TypeError; keep only research-page links.
        if href and 'research' in href:
            allModelTypes.append(href)

In [6]:
URL="https://www.cars.com"
# Small sample of models kept for quick test runs (not used by the loop below).
carNames=["/research/cadillac-escalade-2021","/research/land_rover-defender-2021"]
carTrims=[]

# Visits the trims page of every model in allModelTypes and records the first
# link found in each table cell. For a quick test run, iterate over carNames
# instead of allModelTypes.
for car in allModelTypes:
    r=requests.get(URL+car+"/trims/")
    carsoup = bs4.BeautifulSoup(r.text, 'html.parser')
    time.sleep(1)  # pause between consecutive requests
    for cell in carsoup.find_all("td"):
        # The loop names deliberately avoid `trim`: binding that name at
        # module level would replace the trim(df, car) function whenever this
        # cell is re-run after the function has been defined.
        cellLinks = cell.find_all("a")
        if not cellLinks:
            continue
        trimHref = cellLinks[0].get("href")
        # Skip anchors without an href; None cannot be joined onto URL later.
        if trimHref:
            carTrims.append(trimHref)

In [None]:
# Download every trim page and keep its parsed HTML, one entry per trim,
# in the same order as carTrims.
modelSoup = []

for trimPath in carTrims:
    trimResponse = requests.get(URL + trimPath)
    trimPage = bs4.BeautifulSoup(trimResponse.text, 'html.parser')
    modelSoup.append(trimPage)
    time.sleep(1)  # pause between consecutive requests

In [None]:
def makeModelYear(df,car):
    """Write the "Make", "Model" and "Year" columns of ``df`` from a trim page.

    The values are the texts of the header-section links whose
    ``data-linkname`` attribute is ``research-make``, ``research-make-model``
    and ``research-mmy`` respectively.  Every column is always written; a
    value that cannot be found is stored as "N/A".

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    columnFor = {
        "research-make": "Make",
        "research-make-model": "Model",
        "research-mmy": "Year",
    }
    # Start from "N/A" so a header that lacks one of the links still yields
    # all three columns instead of silently omitting some of them.
    values = dict.fromkeys(columnFor.values(), "N/A")

    header = car.find('section', class_='sds-page-section sds-page-section--header')
    if header is not None:
        for link in header.find_all('a'):
            column = columnFor.get(link.get('data-linkname'))
            if column is not None:
                values[column] = link.text

    for column, value in values.items():
        df[column] = [value]

In [None]:
def trim(df,car):
    """Write the "Trim" column of ``df`` from the page's breadcrumb list.

    The trim name is the text of the sixth ``li`` (index 5) of the breadcrumb
    ``ul`` inside the header section.  If the header, the breadcrumb list or
    that entry is missing, "N/A" is stored instead of raising.

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    header = car.find('section', class_='sds-page-section sds-page-section--header')
    # Look each level up separately so a missing header yields "N/A" rather
    # than an AttributeError escaping to the caller.
    crumbs = None
    if header is not None:
        crumbs = header.find('ul', class_='sds-breadcrumb sds-breadcrumb--mobile-custom')
    items = crumbs.find_all('li') if crumbs is not None else []

    df["Trim"] = [items[5].text if len(items) > 5 else "N/A"]

In [None]:
def vehicleType(df,car):
    """Write the "Vehicle Type" column of ``df``.

    The value is the text of the ``h2`` element with class
    ``primary-body-type-label``, or "N/A" when the page has no such element.

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    label = car.find('h2', class_='primary-body-type-label')
    # Explicit None check instead of a bare except, so unrelated errors
    # (and keyboard interrupts) are not swallowed.
    df["Vehicle Type"] = ["N/A" if label is None else label.text]

In [None]:
def startingMSRP(df,car):
    """Write the "Starting MSRP" column of ``df``.

    The value is the text of the ``div`` element with class ``price-amount``,
    stored as scraped, or "N/A" when the page has no such element.

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    price = car.find('div', class_='price-amount')
    # Explicit None check instead of a bare except, so unrelated errors
    # (and keyboard interrupts) are not swallowed.
    df["Starting MSRP"] = ["N/A" if price is None else price.text]

In [None]:
def keyFeatures(df,car):
    """Write the "Doors" and "Seats" columns of ``df`` from the key specs.

    Each ``div.key-spec`` inside ``div.key-specs-container`` is inspected; the
    text of its ``label`` is stored under "Doors" / "Seats" when it scores 100
    on ``fuzz.partial_ratio`` against 'door' / 'seat'.  Both columns are
    always written; a value that is not found is stored as "N/A".

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    # Default both values so a page that lists only one of them still gets
    # both columns.
    specs = {"Doors": "N/A", "Seats": "N/A"}

    container = car.find('div',class_="key-specs-container") #key specs
    if container is not None:
        for spec in container.find_all('div', class_="key-spec"):
            label = spec.find('label')
            if label is None:
                # Skip an entry without a label rather than discarding the
                # values already collected.
                continue
            if fuzz.partial_ratio('door',label.text)==100:
                specs["Doors"] = label.text
            if fuzz.partial_ratio('seat',label.text)==100:
                specs["Seats"] = label.text

    df["Doors"]=[specs["Doors"]]
    df["Seats"]=[specs["Seats"]]

In [None]:
def extColor(df,car):
    """Write the "Exterior Color" column of ``df``.

    The cell holds a list with the ``data-color-name`` of every
    ``div.color-box`` inside ``div.color-options``, each suffixed with "^^".
    Boxes without that attribute are skipped.  When the page has no
    ``color-options`` container the cell is "N/A".

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    container = car.find('div', class_="color-options") #color options
    if container is None:
        df["Exterior Color"]=["N/A"]
        return

    colorList=[]
    for box in container.find_all('div', class_="color-box"):
        name = box.get("data-color-name")
        # A missing attribute comes back as None; skip that box instead of
        # letting None + "^^" throw away every other colour.
        if name is not None:
            colorList.append(name+"^^")
    df["Exterior Color"]=[colorList]

In [None]:
def intColor(df,car):
    """Write the "Interior Color" column of ``df``.

    The cell holds a list with the text of every ``p`` inside
    ``div.interior-color-container`` except the first, each suffixed with
    "^^".  When the page has no such container the cell is "N/A".

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    container = car.find('div',class_="interior-color-container")
    # Explicit None check instead of a bare except, so unrelated errors
    # (and keyboard interrupts) are not swallowed.
    if container is None:
        df["Interior Color"]=["N/A"]
        return

    colorList = [paragraph.text+"^^" for paragraph in container.find_all('p')]
    # The first paragraph is dropped (presumably a heading rather than a
    # colour name; confirm against the page markup).
    df["Interior Color"]=[colorList[1:]]

In [None]:
def entertainment(df,car):
    """Write the "Entertainment Features" column of ``df``.

    Finds the ``span`` whose text is "Entertainment" inside the
    ``div.sds-accordion`` specs table, moves to the element two siblings
    after the span's grandparent, and collects the ``strong`` text of each
    ``tr`` in it.  Every name is cut to at most 100 characters and suffixed
    with "^^".  Rows without a ``strong`` element are skipped.  When the
    accordion, heading or panel cannot be reached the cell is "N/A".

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    try:
        section = car.find("div",class_="sds-accordion")
        heading = section.find("span",text="Entertainment")
        rows = heading.parent.parent.next_sibling.next_sibling.find_all("tr")
    except AttributeError:
        # One of the lookups returned None (or a node without find_all).
        df["Entertainment Features"]=["N/A"]
        return

    featList=[]
    for row in rows:
        name = row.find("strong")
        if name is None:
            continue
        # Truncate the text itself; slicing the tag object does not work.
        featList.append(name.text[:100]+"^^")
    df["Entertainment Features"]=[featList]

In [None]:
def exterior(df,car):
    """Write the "Exterior Features" column of ``df``.

    Finds the ``span`` whose text is "Exterior" inside the
    ``div.sds-accordion`` specs table, moves to the element two siblings
    after the span's grandparent, and collects the ``strong`` text of each
    ``tr`` in it.  Every name is cut to at most 100 characters and suffixed
    with "^^".  Rows without a ``strong`` element are skipped.  When the
    accordion, heading or panel cannot be reached the cell is "N/A".

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    try:
        section = car.find("div",class_="sds-accordion")
        heading = section.find("span",text="Exterior")
        rows = heading.parent.parent.next_sibling.next_sibling.find_all("tr")
    except AttributeError:
        # One of the lookups returned None (or a node without find_all).
        df["Exterior Features"]=["N/A"]
        return

    featList=[]
    for row in rows:
        name = row.find("strong")
        if name is None:
            continue
        # Truncate the text itself; slicing the tag object does not work.
        featList.append(name.text[:100]+"^^")
    df["Exterior Features"]=[featList]

In [None]:
def interior(df,car):
    """Write the "Interior Features" column of ``df``.

    Finds the ``span`` whose text is "Interior" inside the
    ``div.sds-accordion`` specs table, moves to the element two siblings
    after the span's grandparent, and collects the ``strong`` text of each
    ``tr`` in it.  Every name is cut to at most 100 characters and suffixed
    with "^^".  Rows without a ``strong`` element are skipped.  When the
    accordion, heading or panel cannot be reached the cell is "N/A".

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    try:
        section = car.find("div",class_="sds-accordion")
        heading = section.find("span",text="Interior")
        rows = heading.parent.parent.next_sibling.next_sibling.find_all("tr")
    except AttributeError:
        # One of the lookups returned None (or a node without find_all).
        df["Interior Features"]=["N/A"]
        return

    featList=[]
    for row in rows:
        name = row.find("strong")
        if name is None:
            continue
        # Truncate the text itself; slicing the tag object does not work.
        featList.append(name.text[:100]+"^^")
    df["Interior Features"]=[featList]

In [None]:
def mechanical(df,car):
    """Write the "Mechanical Features" column of ``df``.

    Finds the ``span`` whose text is "Mechanical" inside the
    ``div.sds-accordion`` specs table, moves to the element two siblings
    after the span's grandparent, and collects the ``strong`` text of each
    ``tr`` in it.  Every name is cut to at most 100 characters and suffixed
    with "^^".  Rows without a ``strong`` element are skipped.  When the
    accordion, heading or panel cannot be reached the cell is "N/A".

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    try:
        section = car.find("div",class_="sds-accordion")
        heading = section.find("span",text="Mechanical")
        rows = heading.parent.parent.next_sibling.next_sibling.find_all("tr")
    except AttributeError:
        # One of the lookups returned None (or a node without find_all).
        df["Mechanical Features"]=["N/A"]
        return

    featList=[]
    for row in rows:
        name = row.find("strong")
        if name is None:
            continue
        # Truncate the text itself; slicing the tag object does not work.
        featList.append(name.text[:100]+"^^")
    df["Mechanical Features"]=[featList]

In [None]:
def package(df,car):
    """Write the "Package" column of ``df``.

    Finds the ``span`` whose text is "Package" inside the
    ``div.sds-accordion`` specs table, moves to the element two siblings
    after the span's grandparent, and collects the ``strong`` text of each
    ``tr`` in it.  Every name is cut to at most 100 characters and suffixed
    with "^^".  Rows without a ``strong`` element are skipped.  When the
    accordion, heading or panel cannot be reached the cell is "N/A".

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    try:
        section = car.find("div",class_="sds-accordion")
        heading = section.find("span",text="Package")
        rows = heading.parent.parent.next_sibling.next_sibling.find_all("tr")
    except AttributeError:
        # One of the lookups returned None (or a node without find_all).
        df["Package"]=["N/A"]
        return

    featList=[]
    for row in rows:
        name = row.find("strong")
        if name is None:
            continue
        # Truncate the text itself; slicing the tag object does not work.
        featList.append(name.text[:100]+"^^")
    df["Package"]=[featList]

In [None]:
def processing(df,car):
    """Write the "Processing-Other" column of ``df``.

    Finds the ``span`` whose text is "Processing-Other" inside the
    ``div.sds-accordion`` specs table, moves to the element two siblings
    after the span's grandparent, and collects the ``strong`` text of each
    ``tr`` in it.  Every name is cut to at most 100 characters and suffixed
    with "^^".  Rows without a ``strong`` element are skipped.  When the
    accordion, heading or panel cannot be reached the cell is "N/A".

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    try:
        section = car.find("div",class_="sds-accordion")
        heading = section.find("span",text="Processing-Other")
        rows = heading.parent.parent.next_sibling.next_sibling.find_all("tr")
    except AttributeError:
        # One of the lookups returned None (or a node without find_all).
        df["Processing-Other"]=["N/A"]
        return

    featList=[]
    for row in rows:
        name = row.find("strong")
        if name is None:
            continue
        # Truncate the text itself; slicing the tag object does not work.
        featList.append(name.text[:100]+"^^")
    df["Processing-Other"]=[featList]

In [None]:
def safety(df,car):
    """Write the "Safety Features" column of ``df``.

    Finds the ``span`` whose text is "Safety" inside the
    ``div.sds-accordion`` specs table, moves to the element two siblings
    after the span's grandparent, and collects the ``strong`` text of each
    ``tr`` in it.  Every name is cut to at most 100 characters and suffixed
    with "^^".  Rows without a ``strong`` element are skipped.  When the
    accordion, heading or panel cannot be reached the cell is "N/A".

    Args:
        df: single-vehicle DataFrame, modified in place.
        car: parsed trim page (object with a BeautifulSoup-style ``find``).
    """
    try:
        section = car.find("div",class_="sds-accordion")
        heading = section.find("span",text="Safety")
        rows = heading.parent.parent.next_sibling.next_sibling.find_all("tr")
    except AttributeError:
        # One of the lookups returned None (or a node without find_all).
        df["Safety Features"]=["N/A"]
        return

    featList=[]
    for row in rows:
        name = row.find("strong")
        if name is None:
            continue
        # Truncate the text itself; slicing the tag object does not work.
        featList.append(name.text[:100]+"^^")
    df["Safety Features"]=[featList]

In [None]:
def getTable(car):
    """Return the spec panels of a trim page that are scraped row by row.

    Looks up the ``div.sds-accordion`` specs table and walks its ``h3``
    headings.  For every heading whose ``span`` text is not one of the
    separately handled sections, the element two siblings after the heading
    is collected.

    Args:
        car: parsed trim page (object with a BeautifulSoup-style ``find``).

    Returns:
        List of panel elements, in page order.

    Raises:
        AttributeError: if the page has no specs table (callers rely on this
            to skip such vehicles).
    """
    # Sections that have their own dedicated scraping functions.
    handledElsewhere = {
        "Entertainment",
        "Exterior",
        "Processing-Other",
        "Interior",
        "Mechanical",
        "Package",
        "Safety",
    }
    accordion = car.find("div",class_="sds-accordion")
    return [
        heading.next_sibling.next_sibling
        for heading in accordion.find_all("h3")
        if heading.find("span").text not in handledElsewhere
    ]

In [None]:
# Build one single-row DataFrame per trim page and stack them into `frame`.
frame=pd.DataFrame()
categories=[]
vehicleRows=[]  # per-vehicle frames, concatenated once after the loop
for car in modelSoup:
    df=pd.DataFrame()

    makeModelYear(df,car)
    trim(df,car)
    vehicleType(df,car)
    startingMSRP(df,car)
    keyFeatures(df,car)
    extColor(df,car)
    intColor(df,car)

    try:
        table=getTable(car)

        # Every row of the generic spec panels becomes its own column,
        # named after the row's <strong> label.
        for category in table:
            for feature in category.find_all("tr"):
                standard=feature.find("strong")
                content=feature.find("td","row-content")
                if standard is None or content is None:
                    # Skip a malformed row instead of dropping the vehicle.
                    continue
                df[standard.text]=[content.text]
    except AttributeError:
        # No usable specs table on this page: leave the vehicle out of the
        # final table, as intended for vehicles without stats.
        continue

    entertainment(df,car)
    exterior(df,car)
    interior(df,car)
    mechanical(df,car)
    package(df,car)
    processing(df,car)
    safety(df,car)

    vehicleRows.append(df)

# DataFrame.append no longer exists in pandas 2.x (and, inside the old bare
# except, its failure silently dropped every vehicle); a single concat
# produces the same stacked table without re-copying it on each iteration.
if vehicleRows:
    frame=pd.concat(vehicleRows)



In [None]:
# Print every column with its non-null count. `show_counts` replaces the
# `null_counts` keyword, which pandas deprecated in 1.2 and removed in 2.0.
frame.info(verbose=True, show_counts=True)

In [None]:
# Write the compiled table to car_data.csv (relative path, so it lands in the
# current working directory).
frame.to_csv('car_data.csv')