Documentation


IMPORTANT: If you are testing on a retail site, do not paste the URL of the homepage or of a page listing multiple products; you have to paste the URL of a single product page 😉

Getting started

API endpoints:

POST  http://api.scraping-bot.io/scrape/raw-html  ➡️ see "Basic usage: Raw HTML" below

POST  http://api.scraping-bot.io/scrape/retail  ➡️ see "Retail API" below

POST  http://api.scraping-bot.io/scrape/real-estate  ➡️ see "Real Estate API" below

Hey! Just getting started with the ScrapingBot API? No problem: this page introduces the basics and provides scraping code samples in several languages.

  • If a request fails, we still return a 200 status code, with a JSON body whose error field is populated; you may retry the request. Make sure to catch these errors! They will occur on hard-to-scrape websites (a retry sketch follows this list).

  • If you exceed your plan's concurrent-connection limit, the API will respond with a 402 status code; this can be solved by slowing down your request rate.

  • If you exceed your plan's API call quota, the API will also respond with a 402 status code; this can be solved by upgrading your plan.

  • There is no overage allowed on the free plan: if you exceed 100 requests per month on the free plan, you will receive a 402 error.

  • Note that requests on Google count for 10 request credits, and the same applies if you use Premium Proxies.
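
To make this error handling concrete, here is a minimal sketch in Python. The retail endpoint and credential placeholders come from this page; the retry count and backoff policy are our own illustrative choices, not prescribed by the API:

import time
import requests

API_URL = "http://api.scraping-bot.io/scrape/retail"  # endpoint from this page
AUTH = ("yourUsername", "yourApiKey")  # placeholder credentials

def scrape(url, retries=3):
    body = None
    for attempt in range(retries):
        response = requests.post(API_URL, json={"url": url}, auth=AUTH)
        if response.status_code == 402:
            # Plan limit reached: slow down (concurrency) or upgrade (quota).
            raise RuntimeError("402: plan limit exceeded")
        body = response.json()
        # A 200 response can still carry an error in its JSON body.
        if body.get("error") is None:
            return body
        time.sleep(2 ** attempt)  # simple backoff before retrying
    raise RuntimeError("request kept failing: " + str(body["error"]))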

Each request will return a string with raw JSON:

  • The error field is null by default and is filled with a string if an error occurred

  • The data field contains by default all the fields listed below

  • siteHtml is always null by default; you can retrieve it with the advanced options (see the Advanced options section below)
Result example ⤵
{
    "error": null,
    "data": {
      "title": "Apple iPhone XR 64GB Red Unlocked A2105 GSM SEALED BOX- 1 Year Apple Warranty",
      "description": "Apple iPhone XR. 1 YEAR APPLE CARE WARRANTY.",
      "image": "https://www.scraping-bot.io/iphone_example_ebay_files/s-l500.png",
      "price": 689,
      "shippingFees": 18,
      "currency": "GBP",
      "isInStock": true,
      "EAN13": "0190198770660",
      "ASIN": null,
      "ISBN": null,
      "color": "White",
      "brand": "Apple",
      "category": {
        "name": "Mobile & Smart Phones",
        "url": "https://www.ebay.co.uk/b/Mobile-Smart-Phones-/9355"
      },
      "categories": [
        {
          "name": "Mobile Phones & Communication",
          "url": "https://www.ebay.co.uk/b/Mobile-Phones-Communication-/15032"
        },
        {
          "name": "Mobile & Smart Phones",
          "url": "https://www.ebay.co.uk/b/Mobile-Smart-Phones-/9355"
        }
      ],
      "siteURL": "https://www.ebay.co.uk/itm/Apple-iPhone-XR-64GB-Red-Unlocked-A2105-GSM-SEALED-BOX-1-Year-Apple-Warranty-/123902112947",
      "siteHtml": null,
      "productHasVariations": null,
      "error": null,
      "statusCode": null,
      "isFinished": null,
      "isDead": null,
      "htmlLength": 128016
    }
  }
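
As an illustration, here is a minimal Python sketch consuming this JSON response; the field names follow the example above, and the helper itself is ours:

import json

def parse_response(raw_json):
    """Parse a ScrapingBot response string and return its data payload."""
    body = json.loads(raw_json)
    if body["error"] is not None:
        raise RuntimeError("scraping failed: " + body["error"])
    data = body["data"]
    # Any property the scraper could not extract comes back as null/None.
    print(data["title"], data["price"], data["currency"])
    return data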

Basic usage: Raw HTML

Using ScrapingBot for basic scraping? You'll find more information right here:
Endpoint: POST  http://api.scraping-bot.io/scrape/raw-html
You only need to give us the URL of the website you want to scrape, and you will get the fully rendered HTML of that page, avoiding captchas and blocking.

If you need JavaScript rendering, look at the advanced options below.

var request = require('request');

var username = "yourUsername",
    apiKey = "yourApiKey",
    url = "https://www.scraping-bot.io/rawHtmlPage.html",
    auth = "Basic " + Buffer.from(username + ":" + apiKey).toString("base64");

request(
    {
        method: 'POST',
        url: 'http://api.scraping-bot.io/scrape/raw-html',
        json: {
            url: url
        },
        headers: {
            Accept: 'application/json',
            Authorization : auth
        },
    },
    function(error, response, body) {
        console.log(body);
    }
);

#!/bin/bash
url='https://www.scraping-bot.io/rawHtmlPage.html'
username='yourUsername'
api_key='yourApiKey'
auth=$(echo -ne "$username:$api_key" | base64);

curl -X POST \
  http://api.scraping-bot.io/scrape/raw-html \
  -H "Authorization: Basic $auth" \
  -H "Content-Type: application/json" \
  -d "{\"url\":\"$url\"}"

<?php
$userName="yourUsername";
$apiKey="yourApiKey";

$auth = base64_encode($userName.":".$apiKey);

$curl = curl_init();

curl_setopt_array($curl, array(
  CURLOPT_URL => "http://api.scraping-bot.io/scrape/raw-html",
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => "{\"url\":\"https://www.scraping-bot.io/rawHtmlPage.html\"}",
  CURLOPT_HTTPHEADER => array(
    "Authorization: Basic ".$auth,
    "Content-Type: application/json"
  ),
));

$response = curl_exec($curl);
$err = curl_error($curl);
$info = curl_getinfo($curl); //read the request info before closing the handle

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  if($info["http_code"]>399){
    echo "HTTP Error #:" . $response;
  }else{
    echo $response;
  }
}

import requests
import json

url='https://www.scraping-bot.io/rawHtmlPage.html'
username = 'yourUsername'
apiKey = 'yourApiKey'

apiUrl = "http://api.scraping-bot.io/scrape/raw-html"

payload = json.dumps({"url":url})
headers = {
    'Content-Type': "application/json"
}

response = requests.request("POST", apiUrl, data=payload, auth=(username,apiKey), headers=headers)

print(response.text)

require 'uri'
require 'net/http'
require "base64"

url='https://www.scraping-bot.io/rawHtmlPage.html'
username='yourUsername'
api_key='yourApiKey'

auth=Base64.strict_encode64(username+":"+api_key) # strict_encode64 avoids the trailing newline that encode64 appends

apiUrl = URI("http://api.scraping-bot.io/scrape/raw-html")

http = Net::HTTP.new(apiUrl.host, apiUrl.port)

request = Net::HTTP::Post.new(apiUrl)
request["Content-Type"] = 'application/json'
request["Authorization"] = 'Basic '+auth
request.body = "{\"url\":\""+url+"\"}"

response = http.request(request)
puts response.read_body

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Base64;

public class RawJava {
    public static void main(String[] args) {
        try {
            String username = "yourUsername";
            String apiKey = "yourApiKey";
            String originalInput = username + ":" + apiKey;
            String encodedString = "Basic " + Base64.getEncoder().encodeToString(originalInput.getBytes());
            URL url = new URL("http://api.scraping-bot.io/scrape/raw-html");
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            con.setRequestMethod("POST");
            con.setRequestProperty("Content-Type", "application/json; charset=UTF-8");
            con.setRequestProperty("Authorization", encodedString);

            String param = "{\"url\":\"https://www.scraping-bot.io/rawHtmlPage.html\"}";

            con.setDoOutput(true);
            OutputStream out = con.getOutputStream();
            out.write(param.getBytes());
            out.flush();
            out.close();

            int status = con.getResponseCode();
            System.out.println(status);
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream()));
            String inputLine;
            StringBuilder content = new StringBuilder();
            while ((inputLine = in.readLine()) != null) {
                content.append(inputLine);
            }
            String jsonResponse = content.toString();
            System.out.println(jsonResponse);
            in.close();
            con.disconnect();

        } catch (Exception e) {
            System.out.println("An error occured while scraping:" + e);
        }
    }
}

using System;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using Newtonsoft.Json;


var username="yourUsername";
var apiKey="yourApiKey";

var byteArray = Encoding.ASCII.GetBytes(username+":"+apiKey);
var auth = Convert.ToBase64String(byteArray);
var url = "https://www.scraping-bot.io/rawHtmlPage.html";
var apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html";
var values = new { url };
var json = JsonConvert.SerializeObject(values);
var content = new StringContent(json, Encoding.UTF8,
    "application/json");

HttpClient httpClient = new HttpClient();

httpClient.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Basic", auth);

var response = httpClient.PostAsync(apiEndPoint, content).Result;
var responseString = response.Content.ReadAsStringAsync().Result;

Console.WriteLine(responseString);

Each request will return the raw HTML content of the page.

Result ⤵
<html>
  <head>
  </head>
  <body>
    <h1>Here is my title</h1>
    <p>
        This is the content of my paragraph
    </p>
  </body>
</html>

Advanced options

So you chose the advanced usage of ScrapingBot? I knew you were the best. Here you can find details about the options you can set.

Options:

  • useChrome: Boolean. Set this option to true to use headless Chrome, which is able to render JavaScript and get the full result. WARNING: this option consumes two API calls.

  • premiumProxy: Boolean. Set this option to true to use the premium proxy pool (better for Amazon, Rakuten, Google, etc.). WARNING: this option consumes 10 API calls, and 20 calls if combined with JavaScript rendering (a cost sketch follows this list).

  • proxyCountry: String. Set this option to one of the following values: 'AM', 'AR', 'AT', 'AU', 'AZ', 'BE', 'BO', 'BR', 'BY', 'CA', 'CH', 'CL', 'CN', 'CO', 'CZ', 'DE', 'DK', 'DO', 'EC', 'EE', 'ES', 'FI', 'FR', 'GB', 'GE', 'HK', 'ID', 'IE', 'IL', 'IN', 'IT', 'JM', 'JP', 'KG', 'KH', 'KR', 'KZ', 'LA', 'LK', 'LT', 'LU', 'LV', 'MD', 'MX', 'MY', 'NL', 'NO', 'NZ', 'PE', 'PH', 'RU', 'SE', 'SG', 'TH', 'TJ', 'TM', 'TR', 'TW', 'UA', 'US', 'UZ', 'VN'. It lets you choose the location of the proxy, which is useful in several ways: some sites set the currency according to the location of the IP address, so you can choose the currency of the scraped data; other sites block visitors depending on their location, which this option lets you bypass.

  • waitForNetworkRequests: Boolean. Set to true to wait for most AJAX requests to finish before returning the HTML content (this option can only be used if useChrome is set to true). This can slow down or fail your scraping if some requests never end; only use it if you really need it, for example to get a price that is loaded asynchronously.
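
To make the credit arithmetic concrete, here is a small illustrative Python helper computing the credits consumed by one request under the costs stated above; the function is ours, not part of the API:

def credits_per_request(use_chrome=False, premium_proxy=False):
    """Credits consumed by one API call, per the costs stated above."""
    if premium_proxy and use_chrome:
        return 20  # premium proxies combined with JS rendering
    if premium_proxy:
        return 10  # premium proxies alone (Google requests also cost 10)
    if use_chrome:
        return 2   # headless Chrome consumes two API calls
    return 1

assert credits_per_request() == 1
assert credits_per_request(use_chrome=True) == 2
assert credits_per_request(premium_proxy=True) == 10
assert credits_per_request(use_chrome=True, premium_proxy=True) == 20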

var request = require('request');

var username = "yourUsername",
    apiKey = "yourApiKey",
    url = "https://www.scraping-bot.io/rawHtmlPage.html",
    apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html",
    auth = "Basic " + new Buffer(username + ":" + apiKey).toString("base64");

request(
    {
        method: 'POST',
        url: apiEndPoint,
        json: {
            url: url,
            options: {
                useChrome:false, //set to 'true' if you want to use headless chrome for javascript rendering
                premiumProxy:false, //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
                proxyCountry:null, //allows you to choose a country proxy (example: proxyCountry:"FR")
                waitForNetworkRequests:false, //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                                              //this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
            }
        },
        headers: {
            Accept: 'application/json',
            Authorization : auth
        },
    },
    function(error, response, body) {
        console.log(body);
    }
);

#!/bin/bash
url='https://www.scraping-bot.io/rawHtmlPage.html'
username='yourUsername'
api_key='yourApiKey'
auth=$(echo -ne "$username:$api_key" | base64);
#parameters
useChrome='false' #set to 'true' if you want to use headless chrome for javascript rendering
premiumProxy='false' #set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
proxyCountry='null'  # allows you to choose a country proxy (example: proxyCountry:"FR")
waitForNetworkRequests='false' # wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                               # this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example

apiEndPoint='http://api.scraping-bot.io/scrape/raw-html'

curl -X POST \
  $apiEndPoint \
  -H "Authorization: Basic $auth" \
  -H "Content-Type: application/json" \
  -d "{\"url\":\"$url\",\"options\":{\"useChrome\":$useChrome,\"premiumProxy\":$premiumProxy,\"proxyCountry\":$proxyCountry,\"waitForNetworkRequests\":$waitForNetworkRequests}}"

<?php
$userName="yourUsername";
$apiKey="yourApiKey";

$auth = base64_encode($userName.":".$apiKey);

$postParams = array(
    "url" => "https://www.scraping-bot.io/rawHtmlPage.html",
    'options' => array(
      "useChrome" => false, //set to 'true' if you want to use headless chrome for javascript rendering
      "premiumProxy" => false,  //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
      "proxyCountry" => null, //allows you to choose a country proxy (example: proxyCountry:"FR")
      "waitForNetworkRequests" => false //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                                        //this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
    )
);

$apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html";

$json = json_encode($postParams);


$curl = curl_init();

curl_setopt_array($curl, array(
  CURLOPT_URL => $apiEndPoint,
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => $json,
  CURLOPT_HTTPHEADER => array(
    "Authorization: Basic ".$auth,
    "Content-Type: application/json"
  ),
));

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

import requests
import json

url='https://www.scraping-bot.io/rawHtmlPage.html'
username = 'yourUsername'
apiKey = 'yourApiKey'

apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html"

options = {
    "useChrome": False,#set to True if you want to use headless chrome for javascript rendering
    "premiumProxy": False, # set to True if you want to use premium proxies Unblock Amazon,Google,Rakuten
    "proxyCountry": None, # allows you to choose a country proxy (example: proxyCountry:"FR")
    "waitForNetworkRequests":False # wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                                   # this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
}

payload = json.dumps({"url":url,"options":options})
headers = {
    'Content-Type': "application/json"
}

response = requests.request("POST", apiEndPoint, data=payload, auth=(username,apiKey), headers=headers)

print(response.text)

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Base64;

public class AdvancedRawJava {
    public static void main(String[] args) {
        try {
            String username = "yourUsername";
            String apiKey = "yourApiKey";
            String originalInput = username + ":" + apiKey;
            String encodedString = "Basic " + Base64.getEncoder().encodeToString(originalInput.getBytes());

            String apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html";
            URL url = new URL(apiEndPoint);
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            con.setRequestMethod("POST");
            con.setRequestProperty("Content-Type", "application/json; charset=UTF-8");
            con.setRequestProperty("Authorization", encodedString);

            String useChrome = "false";//set to "true" if you want to use headless chrome for javascript rendering
            String premiumProxy = "false";//set to "true" if you want to use premium proxies Unblock Amazon,Google,Rakuten
            String urlToScrape = "https://www.scraping-bot.io/rawHtmlPage.html";
            String proxyCountry = null;//allows you to choose a country proxy (example: proxyCountry:"FR")
            String waitForNetworkRequests = "false";//set to 'true' if you want to use 'networkidle2'

            String param = "{\"url\":\""+urlToScrape+"\","+
                                "\"options\":{"+
                                    "\"useChrome\":"+useChrome+","+
                                    "\"premiumProxy\":"+premiumProxy+","+
                                    "\"proxyCountry\":"+proxyCountry+","+
                                    "\"waitForNetworkRequests\":"+waitForNetworkRequests+
                                    "}"+
                            "}";

            con.setDoOutput(true);
            OutputStream out = con.getOutputStream();
            out.write(param.getBytes());
            out.flush();
            out.close();

            int status = con.getResponseCode();
            System.out.println(status);
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream()));
            String inputLine;
            StringBuilder content = new StringBuilder();
            while ((inputLine = in.readLine()) != null) {
                content.append(inputLine);
            }
            String jsonResponse = content.toString();
            System.out.println(jsonResponse);
            in.close();
            con.disconnect();

        } catch (Exception e) {
            System.out.println("An error occured while scraping:" + e);
        }
    }
}

using System;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using Newtonsoft.Json;


var username = "yourUsername";
var apiKey = "yourApiKey";

var byteArray = Encoding.ASCII.GetBytes(username + ":" + apiKey);
var auth = Convert.ToBase64String(byteArray);
var apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html";

var values = new
{
    url = "https://www.scraping-bot.io/rawHtmlPage.html",
    options = new
    {
        useChrome = false, //set to 'true' if you want to use headless chrome for javascript rendering
        premiumProxy = false,  //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
        //proxyCountry = "US", //allows you to choose a country proxy (example: proxyCountry="US")
        waitForNetworkRequests = false //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                                       //this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
    }
};
var json = JsonConvert.SerializeObject(values);
var content = new StringContent(json, Encoding.UTF8,
    "application/json");

HttpClient httpClient = new HttpClient();

httpClient.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Basic", auth);

var response = httpClient.PostAsync(apiEndPoint, content).Result;
var responseString = response.Content.ReadAsStringAsync().Result;

Console.WriteLine(responseString);

Retail API

You want to scrape retail websites and don't want to waste time finding the price, title, brand, color and many other properties? The full list of properties appears in the result example below.

Stop wasting your precious time: use our Retail API, give us the product page you want to scrape, and we will return all the data already extracted.

Endpoint: POST  http://api.scraping-bot.io/scrape/retail

Optional parameters:

  • useChrome: Boolean. Set this option to true to use headless Chrome, which is able to render JavaScript and get the full result.

  • premiumProxy: Boolean. Set this option to true to use the premium proxy pool (better for Amazon, Rakuten, Google, etc.).

  • proxyCountry: String. Set this option to one of the following values: 'AM', 'AR', 'AT', 'AU', 'AZ', 'BE', 'BO', 'BR', 'BY', 'CA', 'CH', 'CL', 'CN', 'CO', 'CZ', 'DE', 'DK', 'DO', 'EC', 'EE', 'ES', 'FI', 'FR', 'GB', 'GE', 'HK', 'ID', 'IE', 'IL', 'IN', 'IT', 'JM', 'JP', 'KG', 'KH', 'KR', 'KZ', 'LA', 'LK', 'LT', 'LU', 'LV', 'MD', 'MX', 'MY', 'NL', 'NO', 'NZ', 'PE', 'PH', 'RU', 'SE', 'SG', 'TH', 'TJ', 'TM', 'TR', 'TW', 'UA', 'US', 'UZ', 'VN'. It lets you choose the location of the proxy, which is useful in several ways: some sites set the currency according to the location of the IP address, so you can choose the currency of the scraped data; other sites block visitors depending on their location, which this option lets you bypass.

  • waitForNetworkRequests: Boolean. Set to true to wait for most AJAX requests to finish before returning the HTML content (this option can only be used if useChrome is set to true). This can slow down or fail your scraping if some requests never end; only use it if you really need it, for example to get a price that is loaded asynchronously.

var request = require('request');

var username = "yourUsername",
    apiKey = "yourApiKey",
    url = "https://www.scraping-bot.io/example-ebay.html",
    apiEndPoint = "http://api.scraping-bot.io/scrape/retail",
    auth = "Basic " + new Buffer(username + ":" + apiKey).toString("base64");

request(
    {
        method: 'POST',
        url: apiEndPoint,
        json: {
            url: url,
            options: {
                useChrome:false, //set to 'true' if you want to use headless chrome for javascript rendering
                premiumProxy:false, //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
                proxyCountry:null, //allows you to choose a country proxy (example: proxyCountry:"FR")
                waitForNetworkRequests:false, //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                                              //this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
            }
        },
        headers: {
            Accept: 'application/json',
            Authorization : auth
        },
    },
    function(error, response, body) {
        console.log(body);
    }
);

#!/bin/bash
url='https://www.scraping-bot.io/example-ebay.html'
username='yourUsername'
api_key='yourApiKey'
auth=$(echo -ne "$username:$api_key" | base64);
#parameters
useChrome='false' #set to 'true' if you want to use headless chrome for javascript rendering
premiumProxy='false' #set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
proxyCountry='null'  # allows you to choose a country proxy (example: proxyCountry:"FR")
waitForNetworkRequests='false' # wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                               # this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example

apiEndPoint='http://api.scraping-bot.io/scrape/retail'

curl -X POST \
  $apiEndPoint \
  -H "Authorization: Basic $auth" \
  -H "Content-Type: application/json" \
  -d "{\"url\":\"$url\",\"options\":{\"useChrome\":$useChrome,\"premiumProxy\":$premiumProxy,\"proxyCountry\":$proxyCountry,\"waitForNetworkRequests\":$waitForNetworkRequests}}"

<?php
$userName="yourUsername";
$apiKey="yourApiKey";

$auth = base64_encode($userName.":".$apiKey);

$postParams = array(
    "url" => "https://www.scraping-bot.io/example-ebay.html",
    'options' => array(
      "useChrome" => false, //set to 'true' if you want to use headless chrome for javascript rendering
      "premiumProxy" => false,  //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
      "proxyCountry" => null, //allows you to choose a country proxy (example: proxyCountry:"FR")
      "waitForNetworkRequests" => false //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                                        //this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
    )
);

$apiEndPoint = "http://api.scraping-bot.io/scrape/retail";

$json = json_encode($postParams);


$curl = curl_init();

curl_setopt_array($curl, array(
  CURLOPT_URL => $apiEndPoint,
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => $json,
  CURLOPT_HTTPHEADER => array(
    "Authorization: Basic ".$auth,
    "Content-Type: application/json"
  ),
));

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

import requests
import json

url='https://www.scraping-bot.io/example-ebay.html'
username = 'yourUsername'
apiKey = 'yourApiKey'

apiEndPoint = "http://api.scraping-bot.io/scrape/retail"

options = {
    "useChrome": False,#set to True if you want to use headless chrome for javascript rendering
    "premiumProxy": False, # set to True if you want to use premium proxies Unblock Amazon,Google,Rakuten
    "proxyCountry": None, # allows you to choose a country proxy (example: proxyCountry:"FR")
    "waitForNetworkRequests":False # wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                                   # this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
}

payload = json.dumps({"url":url,"options":options})
headers = {
    'Content-Type': "application/json"
}

response = requests.request("POST", apiEndPoint, data=payload, auth=(username,apiKey), headers=headers)

print(response.text)

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Base64;

public class RetailJava {
    public static void main(String[] args) {
        try {
            String username = "yourUsername";
            String apiKey = "yourApiKey";
            String originalInput = username + ":" + apiKey;
            String encodedString = "Basic " + Base64.getEncoder().encodeToString(originalInput.getBytes());

            String apiEndPoint = "http://api.scraping-bot.io/scrape/retail";
            URL url = new URL(apiEndPoint);
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            con.setRequestMethod("POST");
            con.setRequestProperty("Content-Type", "application/json; charset=UTF-8");
            con.setRequestProperty("Authorization", encodedString);

            String useChrome = "false";//set to "true" if you want to use headless chrome for javascript rendering
            String premiumProxy = "false";//set to "true" if you want to use premium proxies Unblock Amazon,Google,Rakuten
            String urlToScrape = "https://www.scraping-bot.io/example-ebay.html";
            String proxyCountry = null;//allows you to choose a country proxy (example: proxyCountry:"FR")
            String waitForNetworkRequests = "false";//set to 'true' if you want to use 'networkidle2'

            String param = "{\"url\":\""+urlToScrape+"\","+
                                "\"options\":{"+
                                    "\"useChrome\":"+useChrome+","+
                                    "\"premiumProxy\":"+premiumProxy+","+
                                    "\"proxyCountry\":"+proxyCountry+","+
                                    "\"waitForNetworkRequests\":"+waitForNetworkRequests+
                                    "}"+
                            "}";

            con.setDoOutput(true);
            OutputStream out = con.getOutputStream();
            out.write(param.getBytes());
            out.flush();
            out.close();

            int status = con.getResponseCode();
            System.out.println(status);
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream()));
            String inputLine;
            StringBuilder content = new StringBuilder();
            while ((inputLine = in.readLine()) != null) {
                content.append(inputLine);
            }
            String jsonResponse = content.toString();
            System.out.println(jsonResponse);
            in.close();
            con.disconnect();

        } catch (Exception e) {
            System.out.println("An error occured while scraping:" + e);
        }
    }
}

using System;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using Newtonsoft.Json;


var username = "yourUsername";
var apiKey = "yourApiKey";

var byteArray = Encoding.ASCII.GetBytes(username + ":" + apiKey);
var auth = Convert.ToBase64String(byteArray);
var apiEndPoint = "http://api.scraping-bot.io/scrape/retail";

var values = new
{
    url = "https://www.scraping-bot.io/example-ebay.html",
    options = new
    {
        useChrome = false, //set to 'true' if you want to use headless chrome for javascript rendering
        premiumProxy = false,  //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
        //proxyCountry = "US", //allows you to choose a country proxy (example: proxyCountry="US")
        waitForNetworkRequests = false //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                                       //this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
    }
};
var json = JsonConvert.SerializeObject(values);
var content = new StringContent(json, Encoding.UTF8,
    "application/json");

HttpClient httpClient = new HttpClient();

httpClient.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Basic", auth);

var response = httpClient.PostAsync(apiEndPoint, content).Result;
var responseString = response.Content.ReadAsStringAsync().Result;

Console.WriteLine(responseString);

Each request will return a string with raw JSON:

  • The error field is null by default and is filled with a string if an error occurred

  • The data field contains by default all the fields listed below
Result example ⤵
{
    "error": null,
    "data": {
      "title": "Apple iPhone XR 64GB Red Unlocked A2105 GSM SEALED BOX- 1 Year Apple Warranty",
      "description": "Apple iPhone XR. 1 YEAR APPLE CARE WARRANTY.",
      "image": "https://www.scraping-bot.io/iphone_example_ebay_files/s-l500.png",
      "price": 689,
      "shippingFees": 18,
      "currency": "GBP",
      "isInStock": true,
      "EAN13": "0190198770660",
      "ASIN": null,
      "ISBN": null,
      "color": "White",
      "brand": "Apple",
      "category": {
        "name": "Mobile & Smart Phones",
        "url": "https://www.ebay.co.uk/b/Mobile-Smart-Phones-/9355"
      },
      "categories": [
        {
          "name": "Mobile Phones & Communication",
          "url": "https://www.ebay.co.uk/b/Mobile-Phones-Communication-/15032"
        },
        {
          "name": "Mobile & Smart Phones",
          "url": "https://www.ebay.co.uk/b/Mobile-Smart-Phones-/9355"
        }
      ],
      "siteURL": "https://www.ebay.co.uk/itm/Apple-iPhone-XR-64GB-Red-Unlocked-A2105-GSM-SEALED-BOX-1-Year-Apple-Warranty-/123902112947",
      "siteHtml": null,
      "productHasVariations": null,
      "error": null,
      "statusCode": null,
      "isFinished": null,
      "isDead": null,
      "htmlLength": 128016
    }
  }
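
For example, here is a minimal Python sketch computing the total cost of an in-stock product from this response; the field names follow the example above, and the helper itself is ours:

import json

def total_price(raw_json):
    """Return (price + shippingFees, currency) for an in-stock product."""
    body = json.loads(raw_json)
    if body["error"] is not None:
        raise RuntimeError("scraping failed: " + body["error"])
    data = body["data"]
    if not data["isInStock"]:
        return None
    # For the example above: 689 + 18 = 707 GBP
    return data["price"] + data["shippingFees"], data["currency"]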

Real Estate API

You want to scrape real estate websites and don't want to waste time finding the price, title, number of rooms, surface area and many other properties? The full list of properties appears in the result example below.

Stop wasting your precious time: use our Real Estate API, give us the listing page you want to scrape, and we will return all the data already extracted.

Endpoint: POST  http://api.scraping-bot.io/scrape/real-estate

Optional parameters:

  • useChrome: Boolean. Set this option to true to use headless Chrome, which is able to render JavaScript and get the full result.

  • premiumProxy: Boolean. Set this option to true to use the premium proxy pool (better for Amazon, Rakuten, Google, etc.).

  • proxyCountry: String. Set this option to one of the following values: 'AM', 'AR', 'AT', 'AU', 'AZ', 'BE', 'BO', 'BR', 'BY', 'CA', 'CH', 'CL', 'CN', 'CO', 'CZ', 'DE', 'DK', 'DO', 'EC', 'EE', 'ES', 'FI', 'FR', 'GB', 'GE', 'HK', 'ID', 'IE', 'IL', 'IN', 'IT', 'JM', 'JP', 'KG', 'KH', 'KR', 'KZ', 'LA', 'LK', 'LT', 'LU', 'LV', 'MD', 'MX', 'MY', 'NL', 'NO', 'NZ', 'PE', 'PH', 'RU', 'SE', 'SG', 'TH', 'TJ', 'TM', 'TR', 'TW', 'UA', 'US', 'UZ', 'VN'. It lets you choose the location of the proxy, which is useful in several ways: some sites set the currency according to the location of the IP address, so you can choose the currency of the scraped data; other sites block visitors depending on their location, which this option lets you bypass.

  • waitForNetworkRequests: Boolean. Set to true to wait for most AJAX requests to finish before returning the HTML content (this option can only be used if useChrome is set to true). This can slow down or fail your scraping if some requests never end; only use it if you really need it, for example to get a price that is loaded asynchronously.

var request = require('request');

var username = "yourUsername",
    apiKey = "yourApiKey",
    url = "https://www.scraping-bot.io/realEstate.html",
    apiEndPoint = "http://api.scraping-bot.io/scrape/real-estate",
    auth = "Basic " + new Buffer(username + ":" + apiKey).toString("base64");

request(
    {
        method: 'POST',
        url: apiEndPoint,
        json: {
            url: url,
            options: {
                useChrome:false, //set to 'true' if you want to use headless chrome for javascript rendering
                premiumProxy:false, //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
                proxyCountry:null, //allows you to choose a country proxy (example: proxyCountry:"FR")
                waitForNetworkRequests:false, //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                                              //this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
            }
        },
        headers: {
            Accept: 'application/json',
            Authorization : auth
        },
    },
    function(error, response, body) {
        console.log(body);
    }
);

#!/bin/bash
url='https://www.scraping-bot.io/realEstate.html'
username='yourUsername'
api_key='yourApiKey'
auth=$(echo -ne "$username:$api_key" | base64);
#parameters
useChrome='false' #set to 'true' if you want to use headless chrome for javascript rendering
premiumProxy='false' #set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
proxyCountry='null'  # allows you to choose a country proxy (example: proxyCountry:"FR")
waitForNetworkRequests='false' # wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                               # this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example

apiEndPoint='http://api.scraping-bot.io/scrape/real-estate'

curl -X POST \
  $apiEndPoint \
  -H "Authorization: Basic $auth" \
  -H "Content-Type: application/json" \
  -d "{\"url\":\"$url\",\"options\":{\"useChrome\":$useChrome,\"premiumProxy\":$premiumProxy,\"proxyCountry\":$proxyCountry,\"waitForNetworkRequests\":$waitForNetworkRequests}}"

<?php
$userName="yourUsername";
$apiKey="yourApiKey";

$auth = base64_encode($userName.":".$apiKey);

$postParams = array(
    "url" => "https://www.scraping-bot.io/realEstate.html",
    'options' => array(
      "useChrome" => false, //set to 'true' if you want to use headless chrome for javascript rendering
      "premiumProxy" => false,  //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
      "proxyCountry" => null, //allows you to choose a country proxy (example: proxyCountry:"FR")
      "waitForNetworkRequests" => false //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                                        //this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
    )
);

$apiEndPoint = "http://api.scraping-bot.io/scrape/real-estate";

$json = json_encode($postParams);


$curl = curl_init();

curl_setopt_array($curl, array(
  CURLOPT_URL => $apiEndPoint,
  CURLOPT_RETURNTRANSFER => true,
  CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
  CURLOPT_CUSTOMREQUEST => "POST",
  CURLOPT_POSTFIELDS => $json,
  CURLOPT_HTTPHEADER => array(
    "Authorization: Basic ".$auth,
    "Content-Type: application/json"
  ),
));

$response = curl_exec($curl);
$err = curl_error($curl);

curl_close($curl);

if ($err) {
  echo "cURL Error #:" . $err;
} else {
  echo $response;
}

import requests
import json

url='https://www.scraping-bot.io/realEstate.html'
username = 'yourUsername'
apiKey = 'yourApiKey'

apiEndPoint = "http://api.scraping-bot.io/scrape/real-estate"

options = {
    "useChrome": False,#set to True if you want to use headless chrome for javascript rendering
    "premiumProxy": False, # set to True if you want to use premium proxies Unblock Amazon,Google,Rakuten
    "proxyCountry": None, # allows you to choose a country proxy (example: proxyCountry:"FR")
    "waitForNetworkRequests":False # wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
                                   # this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
}

payload = json.dumps({"url":url,"options":options})
headers = {
    'Content-Type': "application/json"
}

response = requests.request("POST", apiEndPoint, data=payload, auth=(username,apiKey), headers=headers)

print(response.text)

import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Base64;

public class RealEstateJava {
    public static void main(String[] args) {
        try {
            String username = "yourUsername";
            String apiKey = "yourApiKey";
            String originalInput = username + ":" + apiKey;
            String encodedString = "Basic " + Base64.getEncoder().encodeToString(originalInput.getBytes());

            String apiEndPoint = "http://api.scraping-bot.io/scrape/real-estate";
            URL url = new URL(apiEndPoint);
            HttpURLConnection con = (HttpURLConnection) url.openConnection();
            con.setRequestMethod("POST");
            con.setRequestProperty("Content-Type", "application/json; charset=UTF-8");
            con.setRequestProperty("Authorization", encodedString);

            String useChrome = "false";//set to "true" if you want to use headless chrome for javascript rendering
            String premiumProxy = "false";//set to "true" if you want to use premium proxies Unblock Amazon,Google,Rakuten
            String urlToScrape = "https://www.scraping-bot.io/realEstate.html";
            String proxyCountry = null;//allows you to choose a country proxy (example: proxyCountry:"FR")
            String waitForNetworkRequests = "false";//set to 'true' if you want to use 'networkidle2'

            String param = "{\"url\":\""+urlToScrape+"\","+
                                "\"options\":{"+
                                    "\"useChrome\":"+useChrome+","+
                                    "\"premiumProxy\":"+premiumProxy+","+
                                    "\"proxyCountry\":"+proxyCountry+","+
                                    "\"waitForNetworkRequests\":"+waitForNetworkRequests+
                                    "}"+
                            "}";

            con.setDoOutput(true);
            OutputStream out = con.getOutputStream();
            out.write(param.getBytes());
            out.flush();
            out.close();

            int status = con.getResponseCode();
            System.out.println(status);
            BufferedReader in = new BufferedReader(
                    new InputStreamReader(con.getInputStream()));
            String inputLine;
            StringBuilder content = new StringBuilder();
            while ((inputLine = in.readLine()) != null) {
                content.append(inputLine);
            }
            String jsonResponse = content.toString();
            System.out.println(jsonResponse);
            in.close();
            con.disconnect();

        } catch (Exception e) {
            System.out.println("An error occured while scraping:" + e);
        }
    }
}

Each request will return a string with raw JSON:

  • The error field is null by default and is filled with a string if an error occurred

  • The data field contains by default all the fields listed below
Result example ⤵
{
  "error": null,
  "data": {
    "title": "Location Studio Montpellier - 415€/mois - appartement F1/T1/1 pièce 18m²",
    "description": "34000 : Appartement disponible le 22 / 01 / 2020 Situé à MONTPELLIER (34000) rue des amaryllis, proche arceaux, du tramway ligne 3 et de la ligne de bus n°10, cet appartement est un T1 de 18,35 m² comprenant une pièce de vie entièrement repeinte avec une kitchenette neuve, ...",
    "surfaceArea": 18,
    "surfaceAreaUnit": "sqm",
    "price": null,
    "currency": "EUR",
    "numberOfRooms": 1,
    "numberOfBedrooms": 0,
    "publishingDate": null,
    "monthlyRent": 415,
    "weeklyRent": null,
    "marketedBy": {
      "name": "GUY HOQUET BPMI",
      "address": "1 Place Pierre Renaudel  34080 Montpellier",
      "phoneNumber": "0411280090"
    },
    "energyClass": "E:284",
    "greenhouseGazClass": null,
    "siteURL": "https://www.seloger.com/annonces/locations/appartement/montpellier-34/les-cevennes/154966057.htm",
    "siteHtml": null,
    "error": null,
    "statusCode": null,
    "htmlLength": 181005,
    "captchaFound": false,
    "isHtmlPage": true,
    "host": "www.seloger.com",
    "codeinsee": "340172"
  }
}
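
As a small worked example, here is a Python sketch deriving the rent per square meter from the fields shown above; the helper is ours, purely illustrative:

import json

def rent_per_sqm(raw_json):
    """Monthly rent per square meter from a real-estate response."""
    data = json.loads(raw_json)["data"]
    # For the example above: 415 EUR / 18 sqm ≈ 23.1 EUR per sqm
    return data["monthlyRent"] / data["surfaceArea"]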

Build a web crawler

In this section, we explain the benefits of a web crawler combined with a scraping API, and the rules for building an efficient one.
Below is a crawler example using the ScrapingBot API with only two dependencies: request and cheerio.
You need at least Node.js 8 because the code uses async/await.

 

const request = require("request");
const util = require("util");
const rp = util.promisify(request);
const sleep = util.promisify(setTimeout);
const cheerio = require('cheerio');
const { URL } = require('url');

let seenLinks = {};

let rootNode = {};
let currentNode = {};

let linksQueue = [];
let printList = [];

let previousDepth = 0;
let maxCrawlingDepth = 5;

let options = null;
let mainDomain = null;
let mainParsedUrl = null;

class CreateLink {
  constructor(linkURL, depth, parent) {
    this.url = linkURL;
    this.depth = depth;
    this.parent = parent;
    this.children = [];
  }
}
//your scraping bot credentials
let username = "yourUsername",
    apiKey = "yourApiKey",
    apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html",
    auth = "Basic " + Buffer.from(username + ":" + apiKey).toString("base64");

let requestOptions = {
  method: 'POST',
  url: apiEndPoint,
  json: {
    url: "this will be replaced in the findLinks function",
    //scraping-bot options
      options: {
          useChrome:false, //if you want to use headless chrome WARNING two api calls will be consumed for this option
          premiumProxy:false, //if you want to use premium proxies Unblock Amazon,linkedIn (consuming 10 calls)
      }
  },
  headers: {
      Accept: 'application/json',
      Authorization : auth
  }
}

//Start the application: put the address where you want to start crawling here
//the second parameter is the depth: with 1 it will scrape all the links found on the first page, but not the ones found on other pages
//with 2 it will scrape all links on the first page and all links found on second-level pages; be careful, on a huge website this represents tons of pages to scrape
//it is recommended to limit the depth to 5 levels
crawlBFS("https://www.scraping-bot.io/", 1);

async function crawlBFS(startURL, maxDepth = 5) {
  try {
    mainParsedUrl = new URL(startURL);
  } catch (e) {
    console.log("URL is not valid", e);
    return;
  }

  mainDomain = mainParsedUrl.hostname;

  maxCrawlingDepth = maxDepth;
  let startLinkObj = new CreateLink(startURL, 0, null);
  rootNode = currentNode = startLinkObj;
  addToLinkQueue(currentNode);
  await findLinks(currentNode);
}

//
async function crawl(linkObj) {
  //Add logs here if needed!
  //console.log(`Checking URL: ${options.url}`);
  await findLinks(linkObj);
}

//The goal is to get the HTML and look for the links inside the page.
async function findLinks(linkObj) {
  //let's set the url we want to scrape
  requestOptions.json.url = linkObj.url
  console.log("Scraping URL : " + linkObj.url);
  let response
  try {
    response = await rp(requestOptions);
    if (response.statusCode !== 200) {
      if (response.statusCode === 401 || response.statusCode === 405) {
        console.log("autentication failed check your credentials");
      } else {
        console.log("an error occurred check the URL" + response.statusCode, response.body);
      }
      return 
    }
    //response.body is the whole content of the page if you want to store some kind of data from the web page you should do it here
    let $ = cheerio.load(response.body);
    let links = $('body').find('a').filter(function (i, el) {
      return $(this).attr('href') != null;
    }).map(function (i, x) {
      return $(this).attr('href');
    });
    if (links.length > 0) {
      links.map(function (i, x) {
        let reqLink = checkDomain(x);
        if (reqLink) {
          if (reqLink != linkObj.url) {
            let newLinkObj = new CreateLink(reqLink, linkObj.depth + 1, linkObj);
            addToLinkQueue(newLinkObj);
          }
        }
      });
    } else {
      console.log("No more links found for " + requestOptions.url);
    }
    let nextLinkObj = getNextInQueue();
    if (nextLinkObj && nextLinkObj.depth <= maxCrawlingDepth) {
      //random sleep
      //It is very important to make this long enough to avoid spamming the website you want to scrape
      //if you choose a short time you will potentially be blocked or kill the website you want to crawl
      //time is in milliseconds here
      let minimumWaitTime = 500; //half a second; these values are very low, in a real-world example you should use at least 30000 (30 seconds between each call)
      let maximumWaitTime = 5000; //max five seconds
      let waitTime = Math.round(minimumWaitTime + (Math.random() * (maximumWaitTime-minimumWaitTime)));
      console.log("wait for " + waitTime + " milliseconds");
      await sleep(waitTime);
      //next url scraping
      await crawl(nextLinkObj);
    } else {
      setRootNode();
      printTree();
    }
  } catch (err) {
    console.log("Something Went Wrong...", err);
  }
}

//Go all the way up and set RootNode to the parent node
function setRootNode() {
  while (currentNode.parent != null) {
    currentNode = currentNode.parent;
  }
  rootNode = currentNode;
}

function printTree() {
  addToPrintDFS(rootNode);
  console.log(printList.join("\n|"));
}

function addToPrintDFS(node) {
  let spaces = Array(node.depth * 3).join("-");
  printList.push(spaces + node.url);
  if (node.children) {
    node.children.map(function (i, x) {
      {
        addToPrintDFS(i);
      }
    });
  }
}

//Check if the domain belongs to the site being checked
function checkDomain(linkURL) {
  let parsedUrl;
  let fullUrl = true;
  try {
    parsedUrl = new URL(linkURL);
  } catch (error) {
    fullUrl = false;
  }
  if (fullUrl === false) {
    if (linkURL.indexOf("/") === 0) {
      //relative to domain url
      return mainParsedUrl.protocol + "//" + mainParsedUrl.hostname + linkURL.split("#")[0];
    } else if (linkURL.indexOf("#") === 0) {
      //anchor avoid link
      return
    } else {
      //relative url
      let path = currentNode.url.match('.*\/')[0]
      return path + linkURL;
    }
  }

  let mainHostDomain = parsedUrl.hostname;

  if (mainDomain == mainHostDomain) {
    //console.log("returning Full Link: " + linkURL);
    parsedUrl.hash = "";
    return parsedUrl.href;
  } else {
    return;
  }
}

function addToLinkQueue(linkobj) {
  if (!linkInSeenListExists(linkobj)) {
    if (linkobj.parent != null) {
      linkobj.parent.children.push(linkobj);
    }
    linksQueue.push(linkobj);
    addToSeen(linkobj);
  }
}

function getNextInQueue() {
  let nextLink = linksQueue.shift();
  if (nextLink && nextLink.depth > previousDepth) {
    previousDepth = nextLink.depth;
    console.log(`------- CRAWLING ON DEPTH LEVEL ${previousDepth} --------`);
  }
  return nextLink;
}

function peekInQueue() {
  return linksQueue[0];
}

//Adds links we've visited to the seenList
function addToSeen(linkObj) {
  seenLinks[linkObj.url] = linkObj;
}

//Returns whether the link has been seen.
function linkInSeenListExists(linkObj) {
  return seenLinks[linkObj.url] == null ? false : true;
}

Need to contact us?

Please fill in the contact form and make your dreams come true!
