Web Scraping Documentation
Documentation, best practices, and usage of our web scraping API
IMPORTANT: If you are testing on a retail site, do not paste the URL of the homepage or of a page with multiple products; you must paste the URL of a single product page 😉
WARNING: The live test is currently not available for the Social Media APIs. You can still test them for free with Postman (or similar software) using the information in the documentation.
Introduction
As an introduction, we want to let you know that if you have any question regarding ScrapingBot, or need help configuring or setting up the API calls, feel free to contact us and we'll do our best to help you get started.
Similarly, if you're not getting the expected scraping result from a website, or if you want us to add support for a retail (or real-estate) website we do not currently support, let us know and we'll look into finding possible solutions.
Getting started
API endpoints :
POST http://api.scraping-bot.io/scrape/raw-html ➡️ Read documentation here
POST http://api.scraping-bot.io/scrape/retail ➡️ Read documentation here
POST http://api.scraping-bot.io/scrape/real-estate ➡️ Read documentation here
Hey! Just getting started with the ScrapingBot API? No problem, we will introduce you to this magical world: here you can find scraping code samples!
- If a request fails, we return a 200 status code with a JSON body containing an error; you may retry the request. Make sure to catch these errors! They will occur on hard-to-scrape websites.
- If you exceed your plan's concurrent connection limit, the API responds with a 429 status code; you can solve this by slowing down your request rate.
- If you exceed your plan's API calls, the API responds with a 402 status code; you can solve this by upgrading your plan.
- There is no overage allowed on the free plan: if you exceed 100 requests per month on the free plan, you will receive a 402 error.
- Note that requests on Google count as 10 request credits, and the same applies when you use Premium Proxies.
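For reference, here is a minimal sketch in Python of one way to handle these status codes (the endpoint, credentials, and retry delays are placeholders, and the helper function is hypothetical, not an official client):

import time
import requests

# placeholder endpoint and credentials; any scrape endpoint behaves the same way
API_ENDPOINT = "http://api.scraping-bot.io/scrape/raw-html"
AUTH = ("yourUsername", "yourApiKey")

def scrape_with_retry(url, max_retries=3):
    for attempt in range(1, max_retries + 1):
        response = requests.post(API_ENDPOINT, json={"url": url}, auth=AUTH)
        if response.status_code == 429:
            # concurrent connection limit exceeded: slow down and retry
            time.sleep(5 * attempt)
            continue
        if response.status_code == 402:
            # plan limit reached (no overage on the free plan): upgrading is the only fix
            raise RuntimeError("API call limit reached, upgrade your plan")
        return response
    raise RuntimeError("still rate limited after %d attempts" % max_retries)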
Each request will return a string with raw JSON
(except for the raw-html endpoint)
- The error field is null by default and is filled with a string if some error occurred
- The data field contains by default all fields listed below
- siteHtml is always null by default; you can get it with the advanced options (example here)
{
"error": null,
"data": {
"title": "Apple iPhone XR 64GB Red Unlocked A2105 GSM SEALED BOX- 1 Year Apple Warranty",
"description": "Apple iPhone XR. 1 YEAR APPLE CARE WARRANTY.",
"image": "https://www.scraping-bot.io/iphone_example_ebay_files/s-l500.png",
"price": 689,
"shippingFees": 18,
"currency": "GBP",
"isInStock": true,
"EAN13": "0190198770660",
"ASIN": null,
"ISBN": null,
"color": "White",
"brand": "Apple",
"category": {
"name": "Mobile & Smart Phones",
"url": "https://www.ebay.co.uk/b/Mobile-Smart-Phones-/9355"
},
"categories": [
{
"name": "Mobile Phones & Communication",
"url": "https://www.ebay.co.uk/b/Mobile-Phones-Communication-/15032"
},
{
"name": "Mobile & Smart Phones",
"url": "https://www.ebay.co.uk/b/Mobile-Smart-Phones-/9355"
}
],
"siteURL": "https://www.ebay.co.uk/itm/Apple-iPhone-XR-64GB-Red-Unlocked-A2105-GSM-SEALED-BOX-1-Year-Apple-Warranty-/123902112947",
"siteHtml": null,
"productHasVariations": null,
"error": null,
"statusCode": null,
"isFinished": null,
"isDead": null,
"htmlLength": 128016
}
}
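As an illustration only (assuming the retail endpoint and the example eBay URL shown above), this is one way to consume such a response in Python: check the error field first, then read the data object:

import requests

apiEndPoint = "http://api.scraping-bot.io/scrape/retail"
payload = {"url": "https://www.ebay.co.uk/itm/Apple-iPhone-XR-64GB-Red-Unlocked-A2105-GSM-SEALED-BOX-1-Year-Apple-Warranty-/123902112947"}
response = requests.post(apiEndPoint, json=payload, auth=("yourUsername", "yourApiKey"))

result = response.json()            # the body is a raw JSON string, as shown above
if result["error"] is not None:     # error stays null unless something went wrong
    print("Scraping error:", result["error"])
else:
    product = result["data"]        # all the fields listed above
    print(product["title"], product["price"], product["currency"])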
Basic usage: Raw HTML
Using ScrapingBot for basic scraping? You'll find more information right here:
Endpoint: POST http://api.scraping-bot.io/scrape/raw-html
You only need to give us the website URL you want to scrape, and you will get the fully rendered HTML of that page, avoiding captchas and blocking.
If you need JavaScript rendering, look at the advanced options right here
var request = require('request');
var username = "yourUsername",
apiKey = "yourApiKey",
url = "https://www.scraping-bot.io/rawHtmlPage.html",
auth = "Basic " + Buffer.from(username + ":" + apiKey).toString("base64");
request(
{
method: 'POST',
url: 'http://api.scraping-bot.io/scrape/raw-html',
json: {
url: url
},
headers: {
Accept: 'application/json',
Authorization : auth
},
},
function(error, response, body) {
console.log(body);
}
);
#!/bin/bash
url='https://www.scraping-bot.io/rawHtmlPage.html'
username='yourUsername'
api_key='yourApiKey'
auth=$(echo -ne "$username:$api_key" | base64);
curl -X POST \
http://api.scraping-bot.io/scrape/raw-html \
-H "Authorization: Basic $auth" \
-H "Content-Type: application/json" \
-d "{\"url\":\"$url\"}"
<?php
$userName="yourUsername";
$apiKey="yourApiKey";
$auth = base64_encode($userName.":".$apiKey);
$curl = curl_init();
curl_setopt_array($curl, array(
CURLOPT_URL => "http://api.scraping-bot.io/scrape/raw-html",
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
CURLOPT_CUSTOMREQUEST => "POST",
CURLOPT_POSTFIELDS => "{\"url\":\"https://www.scraping-bot.io/rawHtmlPage.html\"}",
CURLOPT_HTTPHEADER => array(
"Authorization: Basic ".$auth,
"Content-Type: application/json"
),
));
$response = curl_exec($curl);
$err = curl_error($curl);
$info = curl_getinfo($curl); // read the request info before closing the handle
curl_close($curl);
if ($err) {
echo "cURL Error #:" . $err;
} else {
if($info["http_code"]>399){
echo "HTTP Error #:" . $response;
}else{
echo $response;
}
}
import requests
import json
url='https://www.scraping-bot.io/rawHtmlPage.html'
username = 'yourUsername'
apiKey = 'yourApiKey'
apiUrl = "http://api.scraping-bot.io/scrape/raw-html"
payload = json.dumps({"url":url})
headers = {
'Content-Type': "application/json"
}
response = requests.request("POST", apiUrl, data=payload, auth=(username,apiKey), headers=headers)
print(response.text)
require 'uri'
require 'net/http'
require "base64"
url='https://www.scraping-bot.io/rawHtmlPage.html'
username='yourUsername'
api_key='yourApiKey'
auth=Base64.strict_encode64(username+":"+api_key) # strict_encode64 avoids the newline added by encode64, which would break the Authorization header
apiUrl = URI("http://api.scraping-bot.io/scrape/raw-html")
http = Net::HTTP.new(apiUrl.host, apiUrl.port)
request = Net::HTTP::Post.new(apiUrl)
request["Content-Type"] = 'application/json'
request["Authorization"] = 'Basic '+auth
request.body = "{\"url\":\""+url+"\"}"
response = http.request(request)
puts response.read_body
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Base64;
public class RawJava {
public static void main(String[] args) {
try {
String username = "yourUsername";
String apiKey = "yourApiKey";
String originalInput = username + ":" + apiKey;
String encodedString = "Basic " + Base64.getEncoder().encodeToString(originalInput.getBytes());
URL url = new URL("http://api.scraping-bot.io/scrape/raw-html");
HttpURLConnection con = (HttpURLConnection) url.openConnection();
con.setRequestMethod("POST");
con.setRequestProperty("Content-Type", "application/json; charset=UTF-8");
con.setRequestProperty("Authorization", encodedString);
String param = "{\"url\":\"https://www.scraping-bot.io/rawHtmlPage.html\"}";
con.setDoOutput(true);
OutputStream out = con.getOutputStream();
out.write(param.getBytes());
out.flush();
out.close();
int status = con.getResponseCode();
System.out.println(status);
BufferedReader in = new BufferedReader(
new InputStreamReader(con.getInputStream()));
String inputLine;
StringBuilder content = new StringBuilder();
while ((inputLine = in.readLine()) != null) {
content.append(inputLine);
}
String jsonResponse = content.toString();
System.out.println(jsonResponse);
in.close();
con.disconnect();
} catch (Exception e) {
System.out.println("An error occured while scraping:" + e);
}
}
}
using System;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using Newtonsoft.Json;
var username="yourUsername";
var apiKey="yourApiKey";
var byteArray = Encoding.ASCII.GetBytes(username+":"+apiKey);
var auth = Convert.ToBase64String(byteArray);
var url = "https://www.scraping-bot.io/rawHtmlPage.html";
var apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html";
var values = new { url };
var json = JsonConvert.SerializeObject(values);
var content = new StringContent(json, Encoding.UTF8,
"application/json");
HttpClient httpClient = new HttpClient();
httpClient.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Basic", auth);
var response = httpClient.PostAsync(apiEndPoint, content).Result;
var responseString = response.Content.ReadAsStringAsync().Result;
Console.WriteLine(responseString);
Each request will return the raw HTML content of the page
Result ⤵
<html>
<head>
</head>
<body>
<h1>Here is my title</h1>
<p>
This is the content of my paragraph
</p>
</body>
</html>
Advanced options
You've chosen the advanced usage of ScrapingBot? I knew you were the best. Here you can find details about the options you can set.
Options:
- useChrome: Boolean, set this option to true to use headless Chrome, which is able to render JavaScript and get the full result. WARNING: this option consumes two API calls.
- premiumProxy: Boolean, set this option to true to use the Premium proxy pool (better for Amazon, Rakuten, Google, etc.). WARNING: this option consumes 10 API calls, and 20 calls if combined with JS rendering.
- proxyCountry: String, set this option to one of the following values: "AE", "AL", "AM", "AR", "AT", "AU", "AZ", "BA", "BD", "BE", "BG", "BO", "BR", "BY", "CA", "CH", "CL", "CN", "CO", "CR", "CY", "CZ", "DE", "DK", "DO", "EC", "EE", "EG", "ES", "FI", "FK", "FR", "GB", "GE", "GR", "GS", "HK", "HR", "HU", "ID", "IE", "IL", "IM", "IN", "IQ", "IS", "IT", "JM", "JO", "JP", "KE", "KG", "KH", "KR", "KW", "KZ", "LA", "LK", "LT", "LU", "LV", "MA", "MD", "MK", "MS", "MX", "MY", "NG", "NL", "NO", "NZ", "OM", "PA", "PE", "PH", "PK", "PL", "PT", "RO", "RS", "RU", "SA", "SE", "SG", "SI", "SK", "SL", "TH", "TJ", "TM", "TN", "TR", "TW", "UA", "US", "UZ", "VN", "ZA".
This list is valid when not using the premiumProxy option; see below for the list of country codes you can use with the premiumProxy option.
This option lets you choose the location of the proxy, which has several use cases. For instance, some sites set the currency according to the location of the IP address, so you can choose the currency in which you want the scraped data. Other sites block access depending on the visitor's location, and this option lets you bypass that. An example request body is shown below.
- waitForNetworkRequests: Boolean, set to true if you want to wait for most AJAX requests to finish before the HTML content is returned (this option can only be used if useChrome is set to true). This can slow down or fail your scraping if some requests never end; only use it if you really need it, for example to get a price that is loaded asynchronously.
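Put together, a request body using these options might look like the following sketch (the values are only an illustration; any option can be omitted to keep its default):

{
  "url": "https://www.scraping-bot.io/rawHtmlPage.html",
  "options": {
    "useChrome": true,
    "premiumProxy": false,
    "proxyCountry": "FR",
    "waitForNetworkRequests": true
  }
}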
// When using the premiumProxy option you have access to a larger pool of country
// Keep in mind it's always better to not use a proxyCountry option unless necessary as it limits the
// pool of usable IP and can in turn lower the success rate of requests depending on the website
// Example usage in your call options when calling the api => "proxyCountry": "US"
[
{
"name": "Andorra",
"code": "AD"
},
{
"name": "United Arab Emirates",
"code": "AE"
},
{
"name": "Afghanistan",
"code": "AF"
},
{
"name": "Antigua And Barbuda",
"code": "AG"
},
{
"name": "Anguilla",
"code": "AI"
},
{
"name": "Albania",
"code": "AL"
},
{
"name": "Armenia",
"code": "AM"
},
{
"name": "Angola",
"code": "AO"
},
{
"name": "Argentina",
"code": "AR"
},
{
"name": "American Samoa",
"code": "AS"
},
{
"name": "Austria",
"code": "AT"
},
{
"name": "Australia",
"code": "AU"
},
{
"name": "Aruba",
"code": "AW"
},
{
"name": "Aland Islands",
"code": "AX"
},
{
"name": "Azerbaijan",
"code": "AZ"
},
{
"name": "Bosnia And Herzegovina",
"code": "BA"
},
{
"name": "Barbados",
"code": "BB"
},
{
"name": "Bangladesh",
"code": "BD"
},
{
"name": "Belgium",
"code": "BE"
},
{
"name": "Burkina Faso",
"code": "BF"
},
{
"name": "Bulgaria",
"code": "BG"
},
{
"name": "Bahrain",
"code": "BH"
},
{
"name": "Burundi",
"code": "BI"
},
{
"name": "Benin",
"code": "BJ"
},
{
"name": "Saint Barthelemy",
"code": "BL"
},
{
"name": "Bermuda",
"code": "BM"
},
{
"name": "Brunei",
"code": "BN"
},
{
"name": "Bolivia",
"code": "BO"
},
{
"name": "Bonaire (Caribbean Netherlands)",
"code": "BQ"
},
{
"name": "Brazil",
"code": "BR"
},
{
"name": "Bahamas",
"code": "BS"
},
{
"name": "Bhutan",
"code": "BT"
},
{
"name": "Botswana",
"code": "BW"
},
{
"name": "Belarus",
"code": "BY"
},
{
"name": "Belize",
"code": "BZ"
},
{
"name": "Canada",
"code": "CA"
},
{
"name": "Democratic Republic Of Congo (Zaire)",
"code": "CD"
},
{
"name": "Central African Republic",
"code": "CF"
},
{
"name": "Congo",
"code": "CG"
},
{
"name": "Switzerland",
"code": "CH"
},
{
"name": "Cote D'Ivoire (Ivory Coast)",
"code": "CI"
},
{
"name": "Cook Islands",
"code": "CK"
},
{
"name": "Chile",
"code": "CL"
},
{
"name": "Cameroon",
"code": "CM"
},
{
"name": "China",
"code": "CN"
},
{
"name": "Colombia",
"code": "CO"
},
{
"name": "Costa Rica",
"code": "CR"
},
{
"name": "Cuba",
"code": "CU"
},
{
"name": "Cape Verde",
"code": "CV"
},
{
"name": "Curacao",
"code": "CW"
},
{
"name": "Christmas Island",
"code": "CX"
},
{
"name": "Cyprus",
"code": "CY"
},
{
"name": "Czech Republic",
"code": "CZ"
},
{
"name": "Germany",
"code": "DE"
},
{
"name": "Djibouti",
"code": "DJ"
},
{
"name": "Denmark",
"code": "DK"
},
{
"name": "Dominica",
"code": "DM"
},
{
"name": "Dominican Republic",
"code": "DO"
},
{
"name": "Algeria",
"code": "DZ"
},
{
"name": "Ecuador",
"code": "EC"
},
{
"name": "Estonia",
"code": "EE"
},
{
"name": "Egypt",
"code": "EG"
},
{
"name": "Western Sahara",
"code": "EH"
},
{
"name": "Eritrea",
"code": "ER"
},
{
"name": "Spain",
"code": "ES"
},
{
"name": "Ethiopia",
"code": "ET"
},
{
"name": "Finland",
"code": "FI"
},
{
"name": "Fiji",
"code": "FJ"
},
{
"name": "Falkland Islands (Malvinas)",
"code": "FK"
},
{
"name": "Micronesia",
"code": "FM"
},
{
"name": "Faroe Islands",
"code": "FO"
},
{
"name": "France",
"code": "FR"
},
{
"name": "Gabon",
"code": "GA"
},
{
"name": "Great Britain",
"code": "GB"
},
{
"name": "Grenada",
"code": "GD"
},
{
"name": "Georgia",
"code": "GE"
},
{
"name": "French Guiana",
"code": "GF"
},
{
"name": "Guernsey",
"code": "GG"
},
{
"name": "Ghana",
"code": "GH"
},
{
"name": "Gibraltar",
"code": "GI"
},
{
"name": "Greenland",
"code": "GL"
},
{
"name": "Gambia",
"code": "GM"
},
{
"name": "Guinea",
"code": "GN"
},
{
"name": "Guadeloupe",
"code": "GP"
},
{
"name": "Equatorial Guinea",
"code": "GQ"
},
{
"name": "Greece",
"code": "GR"
},
{
"name": "Guatemala",
"code": "GT"
},
{
"name": "Guam",
"code": "GU"
},
{
"name": "Guinea-Bissau",
"code": "GW"
},
{
"name": "Guyana",
"code": "GY"
},
{
"name": "Hong Kong",
"code": "HK"
},
{
"name": "Honduras",
"code": "HN"
},
{
"name": "Croatia (Hrvatska)",
"code": "HR"
},
{
"name": "Haiti",
"code": "HT"
},
{
"name": "Hungary",
"code": "HU"
},
{
"name": "Indonesia",
"code": "ID"
},
{
"name": "Ireland",
"code": "IE"
},
{
"name": "Israel",
"code": "IL"
},
{
"name": "Isle of Man",
"code": "IM"
},
{
"name": "India",
"code": "IN"
},
{
"name": "Iceland",
"code": "IS"
},
{
"name": "Italy",
"code": "IT"
},
{
"name": "Bailiwick of Jersey",
"code": "JE"
},
{
"name": "Jamaica",
"code": "JM"
},
{
"name": "Jordan",
"code": "JO"
},
{
"name": "Japan",
"code": "JP"
},
{
"name": "Kenya",
"code": "KE"
},
{
"name": "Kyrgyzstan",
"code": "KG"
},
{
"name": "Cambodia",
"code": "KH"
},
{
"name": "Kiribati",
"code": "KI"
},
{
"name": "Comoros",
"code": "KM"
},
{
"name": "Saint Kitts And Nevis",
"code": "KN"
},
{
"name": "South Korea",
"code": "KR"
},
{
"name": "Kuwait",
"code": "KW"
},
{
"name": "Cayman Islands",
"code": "KY"
},
{
"name": "Kazakhstan",
"code": "KZ"
},
{
"name": "Laos",
"code": "LA"
},
{
"name": "Saint Lucia",
"code": "LC"
},
{
"name": "Liechtenstein",
"code": "LI"
},
{
"name": "Sri Lanka",
"code": "LK"
},
{
"name": "Liberia",
"code": "LR"
},
{
"name": "Lesotho",
"code": "LS"
},
{
"name": "Lithuania",
"code": "LT"
},
{
"name": "Luxembourg",
"code": "LU"
},
{
"name": "Latvia",
"code": "LV"
},
{
"name": "Libya",
"code": "LY"
},
{
"name": "Morocco",
"code": "MA"
},
{
"name": "Monaco",
"code": "MC"
},
{
"name": "Moldova",
"code": "MD"
},
{
"name": "Montenegro",
"code": "ME"
},
{
"name": "Saint Martin",
"code": "MF"
},
{
"name": "Madagascar",
"code": "MG"
},
{
"name": "Marshall Islands",
"code": "MH"
},
{
"name": "Macedonia",
"code": "MK"
},
{
"name": "Mali",
"code": "ML"
},
{
"name": "Myanmar (Burma)",
"code": "MM"
},
{
"name": "Mongolia",
"code": "MN"
},
{
"name": "Macau",
"code": "MO"
},
{
"name": "Northern Mariana Islands",
"code": "MP"
},
{
"name": "Martinique",
"code": "MQ"
},
{
"name": "Mauritania",
"code": "MR"
},
{
"name": "Montserrat",
"code": "MS"
},
{
"name": "Malta",
"code": "MT"
},
{
"name": "Mauritius",
"code": "MU"
},
{
"name": "Maldives",
"code": "MV"
},
{
"name": "Malawi",
"code": "MW"
},
{
"name": "Mexico",
"code": "MX"
},
{
"name": "Malaysia",
"code": "MY"
},
{
"name": "Mozambique",
"code": "MZ"
},
{
"name": "Namibia",
"code": "NA"
},
{
"name": "New Caledonia",
"code": "NC"
},
{
"name": "Niger",
"code": "NE"
},
{
"name": "Norfolk Island",
"code": "NF"
},
{
"name": "Nigeria",
"code": "NG"
},
{
"name": "Nicaragua",
"code": "NI"
},
{
"name": "Netherlands",
"code": "NL"
},
{
"name": "Norway",
"code": "NO"
},
{
"name": "Nepal",
"code": "NP"
},
{
"name": "Nauru",
"code": "NR"
},
{
"name": "Niue",
"code": "NU"
},
{
"name": "New Zealand",
"code": "NZ"
},
{
"name": "Oman",
"code": "OM"
},
{
"name": "Panama",
"code": "PA"
},
{
"name": "Peru",
"code": "PE"
},
{
"name": "French Polynesia",
"code": "PF"
},
{
"name": "Papua New Guinea",
"code": "PG"
},
{
"name": "Philippines",
"code": "PH"
},
{
"name": "Pakistan",
"code": "PK"
},
{
"name": "Poland",
"code": "PL"
},
{
"name": "Puerto Rico",
"code": "PR"
},
{
"name": "Palestine",
"code": "PS"
},
{
"name": "Portugal",
"code": "PT"
},
{
"name": "Palau",
"code": "PW"
},
{
"name": "Paraguay",
"code": "PY"
},
{
"name": "Qatar",
"code": "QA"
},
{
"name": "Réunion",
"code": "RE"
},
{
"name": "Romania",
"code": "RO"
},
{
"name": "Serbia",
"code": "RS"
},
{
"name": "Russia",
"code": "RU"
},
{
"name": "Rwanda",
"code": "RW"
},
{
"name": "Saudi Arabia",
"code": "SA"
},
{
"name": "Solomon Islands",
"code": "SB"
},
{
"name": "Seychelles",
"code": "SC"
},
{
"name": "Sudan",
"code": "SD"
},
{
"name": "Sweden",
"code": "SE"
},
{
"name": "Singapore",
"code": "SG"
},
{
"name": "Slovenia",
"code": "SI"
},
{
"name": "Slovak Republic",
"code": "SK"
},
{
"name": "Sierra Leone",
"code": "SL"
},
{
"name": "San Marino",
"code": "SM"
},
{
"name": "Senegal",
"code": "SN"
},
{
"name": "Somalia",
"code": "SO"
},
{
"name": "Suriname",
"code": "SR"
},
{
"name": "South Sudan",
"code": "SS"
},
{
"name": "Sao Tome And Principe",
"code": "ST"
},
{
"name": "El Salvador",
"code": "SV"
},
{
"name": "Sint Maarten (Dutch part)",
"code": "SX"
},
{
"name": "Swaziland",
"code": "SZ"
},
{
"name": "Turks And Caicos Islands",
"code": "TC"
},
{
"name": "Chad",
"code": "TD"
},
{
"name": "Togo",
"code": "TG"
},
{
"name": "Thailand",
"code": "TH"
},
{
"name": "Tajikistan",
"code": "TJ"
},
{
"name": "Timor-Leste",
"code": "TL"
},
{
"name": "Turkmenistan",
"code": "TM"
},
{
"name": "Tunisia",
"code": "TN"
},
{
"name": "Tonga",
"code": "TO"
},
{
"name": "Turkey",
"code": "TR"
},
{
"name": "Trinidad And Tobago",
"code": "TT"
},
{
"name": "Tuvalu",
"code": "TV"
},
{
"name": "Taiwan",
"code": "TW"
},
{
"name": "Tanzania",
"code": "TZ"
},
{
"name": "Ukraine",
"code": "UA"
},
{
"name": "Uganda",
"code": "UG"
},
{
"name": "United States",
"code": "US"
},
{
"name": "Uruguay",
"code": "UY"
},
{
"name": "Uzbekistan",
"code": "UZ"
},
{
"name": "Vatican City (Holy See)",
"code": "VA"
},
{
"name": "Saint Vincent And The Grenadines",
"code": "VC"
},
{
"name": "Venezuela",
"code": "VE"
},
{
"name": "Virgin Islands (British)",
"code": "VG"
},
{
"name": "Virgin Islands (US)",
"code": "VI"
},
{
"name": "Vietnam",
"code": "VN"
},
{
"name": "Vanuatu",
"code": "VU"
},
{
"name": "Wallis And Futuna Islands",
"code": "WF"
},
{
"name": "Western Samoa",
"code": "WS"
},
{
"name": "Kosovo",
"code": "XK"
},
{
"name": "Yemen",
"code": "YE"
},
{
"name": "Mayotte",
"code": "YT"
},
{
"name": "South Africa",
"code": "ZA"
},
{
"name": "Zambia",
"code": "ZM"
},
{
"name": "Zimbabwe",
"code": "ZW"
}
]
var request = require('request');
var username = "yourUsername",
apiKey = "yourApiKey",
url = "https://www.scraping-bot.io/rawHtmlPage.html",
apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html",
auth = "Basic " + new Buffer(username + ":" + apiKey).toString("base64");
request(
{
method: 'POST',
url: apiEndPoint,
json: {
url: url,
options: {
useChrome:false, //set to 'true' if you want to use headless chrome for javascript rendering
premiumProxy:false, //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
proxyCountry:null, //allows you to choose a country proxy (example: proxyCountry:"FR")
waitForNetworkRequests:false, //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
//this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
}
},
headers: {
Accept: 'application/json',
Authorization : auth
},
},
function(error, response, body) {
console.log(body);
}
);
#!/bin/bash
url='https://www.scraping-bot.io/rawHtmlPage.html'
username='yourUsername'
api_key='yourApiKey'
auth=$(echo -ne "$username:$api_key" | base64);
#parameters
useChrome='false' #set to 'true' if you want to use headless chrome for javascript rendering
premiumProxy='false' #set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
proxyCountry='null' # allows you to choose a country proxy (example: proxyCountry:"FR")
waitForNetworkRequests='false' # wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
# this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
apiEndPoint='http://api.scraping-bot.io/scrape/raw-html'
curl -X POST \
$apiEndPoint \
-H "Authorization: Basic $auth" \
-H "Content-Type: application/json" \
-d "{\"url\":\"$url\",\"options\":{\"useChrome\":$useChrome,\"premiumProxy\":$premiumProxy,\"proxyCountry\":$proxyCountry,\"waitForNetworkRequests\":$waitForNetworkRequests}}"
<?php
$userName="yourUsername";
$apiKey="yourApiKey";
$auth = base64_encode($userName.":".$apiKey);
$postParams = array(
"url" => "https://www.scraping-bot.io/rawHtmlPage.html",
'options' => array(
"useChrome" => false, //set to 'true' if you want to use headless chrome for javascript rendering
"premiumProxy" => false, //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
"proxyCountry" => null, //allows you to choose a country proxy (example: proxyCountry:"FR")
"waitForNetworkRequests" => false //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
//this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
)
);
$apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html";
$json = json_encode($postParams);
$curl = curl_init();
curl_setopt_array($curl, array(
CURLOPT_URL => $apiEndPoint,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
CURLOPT_CUSTOMREQUEST => "POST",
CURLOPT_POSTFIELDS => $json,
CURLOPT_HTTPHEADER => array(
"Authorization: Basic ".$auth,
"Content-Type: application/json"
),
));
$response = curl_exec($curl);
$err = curl_error($curl);
curl_close($curl);
if ($err) {
echo "cURL Error #:" . $err;
} else {
echo $response;
}
import requests
import json
url='https://www.scraping-bot.io/rawHtmlPage.html'
username = 'yourUsername'
apiKey = 'yourApiKey'
apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html"
options = {
"useChrome": False,#set to True if you want to use headless chrome for javascript rendering
"premiumProxy": False, # set to True if you want to use premium proxies Unblock Amazon,Google,Rakuten
"proxyCountry": None, # allows you to choose a country proxy (example: proxyCountry:"FR")
"waitForNetworkRequests":False # wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
# this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
}
payload = json.dumps({"url":url,"options":options})
headers = {
'Content-Type': "application/json"
}
response = requests.request("POST", apiEndPoint, data=payload, auth=(username,apiKey), headers=headers)
print(response.text)
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Base64;
public class AdvancedRawJava {
public static void main(String[] args) {
try {
String username = "yourUsername";
String apiKey = "yourApiKey";
String originalInput = username + ":" + apiKey;
String encodedString = "Basic " + Base64.getEncoder().encodeToString(originalInput.getBytes());
String apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html";
URL url = new URL(apiEndPoint);
HttpURLConnection con = (HttpURLConnection) url.openConnection();
con.setRequestMethod("POST");
con.setRequestProperty("Content-Type", "application/json; charset=UTF-8");
con.setRequestProperty("Authorization", encodedString);
String useChrome = "false";//set to "true" if you want to use headless chrome for javascript rendering
String premiumProxy = "false";//set to "true" if you want to use premium proxies Unblock Amazon,Google,Rakuten
String urlToScrape = "https://www.scraping-bot.io/rawHtmlPage.html";
String proxyCountry = null;//allows you to choose a country proxy (example: proxyCountry:"FR")
String waitForNetworkRequests = "false";//set to 'true' if you want to use 'networkidle2'
String param = "{\"url\":\""+urlToScrape+"\","+
"\"options\":{"+
"\"useChrome\":"+useChrome+","+
"\"premiumProxy\":"+premiumProxy+","+
"\"proxyCountry\":"+proxyCountry+","+
"\"waitForNetworkRequests\":"+waitForNetworkRequests+
"}"+
"}";
con.setDoOutput(true);
OutputStream out = con.getOutputStream();
out.write(param.getBytes());
out.flush();
out.close();
int status = con.getResponseCode();
System.out.println(status);
BufferedReader in = new BufferedReader(
new InputStreamReader(con.getInputStream()));
String inputLine;
StringBuilder content = new StringBuilder();
while ((inputLine = in.readLine()) != null) {
content.append(inputLine);
}
String jsonResponse = content.toString();
System.out.println(jsonResponse);
in.close();
con.disconnect();
} catch (Exception e) {
System.out.println("An error occured while scraping:" + e);
}
}
}
using System;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using Newtonsoft.Json;
var username = "yourUsername";
var apiKey = "yourApiKey";
var byteArray = Encoding.ASCII.GetBytes(username + ":" + apiKey);
var auth = Convert.ToBase64String(byteArray);
var apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html";
var values = new
{
url = "https://www.scraping-bot.io/rawHtmlPage.html",
options = new
{
useChrome = false, //set to 'true' if you want to use headless chrome for javascript rendering
premiumProxy = false, //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
//proxyCountry = "US", //allows you to choose a country proxy (example: proxyCountry="US")
waitForNetworkRequests = false //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
//this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
}
};
var json = JsonConvert.SerializeObject(values);
var content = new StringContent(json, Encoding.UTF8,
"application/json");
HttpClient httpClient = new HttpClient();
httpClient.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Basic", auth);
var response = httpClient.PostAsync(apiEndPoint, content).Result;
var responseString = response.Content.ReadAsStringAsync().Result;
Console.WriteLine(responseString);
Retail API
You want to scrape retail websites and don't want to waste time finding the price tags, title, brand, color, and many other properties (full list of props here).
Stop wasting your precious time and use our Retail API: give us the product page you want to scrape and we will give you back all the data already extracted.
Endpoint: POST http://api.scraping-bot.io/scrape/retail
Optional parameters:
- useChrome: Boolean, set this option to true to use headless Chrome, which is able to render JavaScript and get the full result.
- premiumProxy: Boolean, set this option to true to use the Premium proxy pool (better for Amazon, Rakuten, Google, etc.).
- proxyCountry: String, set this option to one of the following values: "AE", "AL", "AM", "AR", "AT", "AU", "AZ", "BA", "BD", "BE", "BG", "BO", "BR", "BY", "CA", "CH", "CL", "CN", "CO", "CR", "CY", "CZ", "DE", "DK", "DO", "EC", "EE", "EG", "ES", "FI", "FK", "FR", "GB", "GE", "GR", "GS", "HK", "HR", "HU", "ID", "IE", "IL", "IM", "IN", "IQ", "IS", "IT", "JM", "JO", "JP", "KE", "KG", "KH", "KR", "KW", "KZ", "LA", "LK", "LT", "LU", "LV", "MA", "MD", "MK", "MS", "MX", "MY", "NG", "NL", "NO", "NZ", "OM", "PA", "PE", "PH", "PK", "PL", "PT", "RO", "RS", "RU", "SA", "SE", "SG", "SI", "SK", "SL", "TH", "TJ", "TM", "TN", "TR", "TW", "UA", "US", "UZ", "VN", "ZA".
This list is valid when not using the premiumProxy option; see above (in the Advanced options section) for the list of country codes you can use with the premiumProxy option.
This option lets you choose the location of the proxy, which has several use cases. For instance, some sites set the currency according to the location of the IP address, so you can choose the currency in which you want the scraped data. Other sites block access depending on the visitor's location, and this option lets you bypass that.
- waitForNetworkRequests: Boolean, set to true if you want to wait for most AJAX requests to finish before the HTML content is returned (this option can only be used if useChrome is set to true). This can slow down or fail your scraping if some requests never end; only use it if you really need it, for example to get a price that is loaded asynchronously.
var request = require('request');
var username = "yourUsername",
apiKey = "yourApiKey",
url = "https://www.scraping-bot.io/example-ebay.html",
apiEndPoint = "http://api.scraping-bot.io/scrape/retail",
auth = "Basic " + new Buffer(username + ":" + apiKey).toString("base64");
request(
{
method: 'POST',
url: apiEndPoint,
json: {
url: url,
options: {
useChrome:false, //set to 'true' if you want to use headless chrome for javascript rendering
premiumProxy:false, //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
proxyCountry:null, //allows you to choose a country proxy (example: proxyCountry:"FR")
waitForNetworkRequests:false, //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
//this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
}
},
headers: {
Accept: 'application/json',
Authorization : auth
},
},
function(error, response, body) {
console.log(body);
}
);
#!/bin/bash
url='https://www.scraping-bot.io/example-ebay.html'
username='yourUsername'
api_key='yourApiKey'
auth=$(echo -ne "$username:$api_key" | base64);
#parameters
useChrome='false' #set to 'true' if you want to use headless chrome for javascript rendering
premiumProxy='false' #set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
proxyCountry='null' # allows you to choose a country proxy (example: proxyCountry:"FR")
waitForNetworkRequests='false' # wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
# this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
apiEndPoint='http://api.scraping-bot.io/scrape/retail'
curl -X POST \
$apiEndPoint \
-H "Authorization: Basic $auth" \
-H "Content-Type: application/json" \
-d "{\"url\":\"$url\",\"options\":{\"useChrome\":$useChrome,\"premiumProxy\":$premiumProxy,\"proxyCountry\":$proxyCountry,\"waitForNetworkRequests\":$waitForNetworkRequests}}"
<?php
$userName="yourUsername";
$apiKey="yourApiKey";
$auth = base64_encode($userName.":".$apiKey);
$postParams = array(
"url" => "https://www.scraping-bot.io/example-ebay.html",
'options' => array(
"useChrome" => false, //set to 'true' if you want to use headless chrome for javascript rendering
"premiumProxy" => false, //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
"proxyCountry" => null, //allows you to choose a country proxy (example: proxyCountry:"FR")
"waitForNetworkRequests" => false //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
//this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
)
);
$apiEndPoint = "http://api.scraping-bot.io/scrape/retail";
$json = json_encode($postParams);
$curl = curl_init();
curl_setopt_array($curl, array(
CURLOPT_URL => $apiEndPoint,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
CURLOPT_CUSTOMREQUEST => "POST",
CURLOPT_POSTFIELDS => $json,
CURLOPT_HTTPHEADER => array(
"Authorization: Basic ".$auth,
"Content-Type: application/json"
),
));
$response = curl_exec($curl);
$err = curl_error($curl);
curl_close($curl);
if ($err) {
echo "cURL Error #:" . $err;
} else {
echo $response;
}
import requests
import json
url='https://www.scraping-bot.io/example-ebay.html'
username = 'yourUsername'
apiKey = 'yourApiKey'
apiEndPoint = "http://api.scraping-bot.io/scrape/retail"
options = {
"useChrome": False,#set to True if you want to use headless chrome for javascript rendering
"premiumProxy": False, # set to True if you want to use premium proxies Unblock Amazon,Google,Rakuten
"proxyCountry": None, # allows you to choose a country proxy (example: proxyCountry:"FR")
"waitForNetworkRequests":False # wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
# this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
}
payload = json.dumps({"url":url,"options":options})
headers = {
'Content-Type': "application/json"
}
response = requests.request("POST", apiEndPoint, data=payload, auth=(username,apiKey), headers=headers)
print(response.text)
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Base64;
public class AdvancedRawJava {
public static void main(String[] args) {
try {
String username = "yourUsername";
String apiKey = "yourApiKey";
String originalInput = username + ":" + apiKey;
String encodedString = "Basic " + Base64.getEncoder().encodeToString(originalInput.getBytes());
String apiEndPoint = "http://api.scraping-bot.io/scrape/retail";
URL url = new URL(apiEndPoint);
HttpURLConnection con = (HttpURLConnection) url.openConnection();
con.setRequestMethod("POST");
con.setRequestProperty("Content-Type", "application/json; charset=UTF-8");
con.setRequestProperty("Authorization", encodedString);
String useChrome = "false";//set to "true" if you want to use headless chrome for javascript rendering
String premiumProxy = "false";//set to "true" if you want to use premium proxies Unblock Amazon,Google,Rakuten
String urlToScrape = "https://www.scraping-bot.io/example-ebay.html";
String proxyCountry = null;//allows you to choose a country proxy (example: proxyCountry:"FR")
String waitForNetworkRequests = "false";//set to 'true' if you want to use 'networkidle2'
String param = "{\"url\":\""+urlToScrape+"\","+
"\"options\":{"+
"\"useChrome\":"+useChrome+","+
"\"premiumProxy\":"+premiumProxy+","+
"\"proxyCountry\":"+proxyCountry+","+
"\"waitForNetworkRequests\":"+waitForNetworkRequests+
"}"+
"}";
con.setDoOutput(true);
OutputStream out = con.getOutputStream();
out.write(param.getBytes());
out.flush();
out.close();
int status = con.getResponseCode();
System.out.println(status);
BufferedReader in = new BufferedReader(
new InputStreamReader(con.getInputStream()));
String inputLine;
StringBuilder content = new StringBuilder();
while ((inputLine = in.readLine()) != null) {
content.append(inputLine);
}
String jsonResponse = content.toString();
System.out.println(jsonResponse);
in.close();
con.disconnect();
} catch (Exception e) {
System.out.println("An error occured while scraping:" + e);
}
}
}
using System;
using System.Net.Http;
using System.Text;
using System.Threading.Tasks;
using Newtonsoft.Json;
var username = "yourUsername";
var apiKey = "yourApiKey";
var byteArray = Encoding.ASCII.GetBytes(username + ":" + apiKey);
var auth = Convert.ToBase64String(byteArray);
var apiEndPoint = "http://api.scraping-bot.io/scrape/retail";
var values = new
{
url = "https://www.scraping-bot.io/example-ebay.html",
options = new
{
useChrome = false, //set to 'true' if you want to use headless chrome for javascript rendering
premiumProxy = false, //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
//proxyCountry = "US", //allows you to choose a country proxy (example: proxyCountry="US")
waitForNetworkRequests = false //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
//this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
}
};
var json = JsonConvert.SerializeObject(values);
var content = new StringContent(json, Encoding.UTF8,
"application/json");
HttpClient httpClient = new HttpClient();
httpClient.DefaultRequestHeaders.Authorization = new System.Net.Http.Headers.AuthenticationHeaderValue("Basic", auth);
var response = httpClient.PostAsync(apiEndPoint, content).Result;
var responseString = response.Content.ReadAsStringAsync().Result;
Console.WriteLine(responseString);
Each request will return a string with raw JSON
- The error field is null by default and is filled with a string if some error occurred
- The data field contains by default all fields listed below
Result example ⤵
{
"error": null,
"data": {
"title": "Apple iPhone XR 64GB Red Unlocked A2105 GSM SEALED BOX- 1 Year Apple Warranty",
"description": "Apple iPhone XR. 1 YEAR APPLE CARE WARRANTY.",
"image": "https://www.scraping-bot.io/iphone_example_ebay_files/s-l500.png",
"price": 689,
"shippingFees": 18,
"currency": "GBP",
"isInStock": true,
"EAN13": "0190198770660",
"ASIN": null,
"ISBN": null,
"color": "White",
"brand": "Apple",
"category": {
"name": "Mobile & Smart Phones",
"url": "https://www.ebay.co.uk/b/Mobile-Smart-Phones-/9355"
},
"categories": [
{
"name": "Mobile Phones & Communication",
"url": "https://www.ebay.co.uk/b/Mobile-Phones-Communication-/15032"
},
{
"name": "Mobile & Smart Phones",
"url": "https://www.ebay.co.uk/b/Mobile-Smart-Phones-/9355"
}
],
"siteURL": "https://www.ebay.co.uk/itm/Apple-iPhone-XR-64GB-Red-Unlocked-A2105-GSM-SEALED-BOX-1-Year-Apple-Warranty-/123902112947",
"siteHtml": null,
"productHasVariations": null,
"error": null,
"statusCode": null,
"isFinished": null,
"isDead": null,
"htmlLength": 128016
}
}
Real Estate API
You want to scrape real estate websites and don't want to waste time finding the price tags, title, number of rooms, surfaceArea, and many other properties (full list of props here).
Stop wasting your precious time and use our Real Estate API: give us the listing page you want to scrape and we will give you back all the data already extracted.
Endpoint: POST http://api.scraping-bot.io/scrape/real-estate
Optional parameters:
- useChrome: Boolean, set this option to true to use headless Chrome, which is able to render JavaScript and get the full result.
- premiumProxy: Boolean, set this option to true to use the Premium proxy pool (better for Amazon, Rakuten, Google, etc.).
- proxyCountry: String, set this option to one of the following values: "AE", "AL", "AM", "AR", "AT", "AU", "AZ", "BA", "BD", "BE", "BG", "BO", "BR", "BY", "CA", "CH", "CL", "CN", "CO", "CR", "CY", "CZ", "DE", "DK", "DO", "EC", "EE", "EG", "ES", "FI", "FK", "FR", "GB", "GE", "GR", "GS", "HK", "HR", "HU", "ID", "IE", "IL", "IM", "IN", "IQ", "IS", "IT", "JM", "JO", "JP", "KE", "KG", "KH", "KR", "KW", "KZ", "LA", "LK", "LT", "LU", "LV", "MA", "MD", "MK", "MS", "MX", "MY", "NG", "NL", "NO", "NZ", "OM", "PA", "PE", "PH", "PK", "PL", "PT", "RO", "RS", "RU", "SA", "SE", "SG", "SI", "SK", "SL", "TH", "TJ", "TM", "TN", "TR", "TW", "UA", "US", "UZ", "VN", "ZA".
This list is valid when not using the premiumProxy option; see above (in the Advanced options section) for the list of country codes you can use with the premiumProxy option.
This option lets you choose the location of the proxy, which has several use cases. For instance, some sites set the currency according to the location of the IP address, so you can choose the currency in which you want the scraped data. Other sites block access depending on the visitor's location, and this option lets you bypass that.
- waitForNetworkRequests: Boolean, set to true if you want to wait for most AJAX requests to finish before the HTML content is returned (this option can only be used if useChrome is set to true). This can slow down or fail your scraping if some requests never end; only use it if you really need it, for example to get a price that is loaded asynchronously.
var request = require('request');
var username = "yourUsername",
apiKey = "yourApiKey",
url = "https://www.scraping-bot.io/realEstate.html",
apiEndPoint = "http://api.scraping-bot.io/scrape/real-estate",
auth = "Basic " + new Buffer(username + ":" + apiKey).toString("base64");
request(
{
method: 'POST',
url: apiEndPoint,
json: {
url: url,
options: {
useChrome:false, //set to 'true' if you want to use headless chrome for javascript rendering
premiumProxy:false, //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
proxyCountry:null, //allows you to choose a country proxy (example: proxyCountry:"FR")
waitForNetworkRequests:false, //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
//this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
}
},
headers: {
Accept: 'application/json',
Authorization : auth
},
},
function(error, response, body) {
console.log(body);
}
);
#!/bin/bash
url='https://www.scraping-bot.io/realEstate.html'
username='yourUsername'
api_key='yourApiKey'
auth=$(echo -ne "$username:$api_key" | base64);
#parameters
useChrome='false' #set to 'true' if you want to use headless chrome for javascript rendering
premiumProxy='false' #set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
proxyCountry='null' # allows you to choose a country proxy (example: proxyCountry:"FR")
waitForNetworkRequests='false' # wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
# this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
apiEndPoint='http://api.scraping-bot.io/scrape/real-estate'
curl -X POST \
$apiEndPoint \
-H "Authorization: Basic $auth" \
-H "Content-Type: application/json" \
-d "{\"url\":\"$url\",\"options\":{\"useChrome\":$useChrome,\"premiumProxy\":$premiumProxy,\"proxyCountry\":$proxyCountry,\"waitForNetworkRequests\":$waitForNetworkRequests}}"
<?php
$userName="yourUsername";
$apiKey="yourApiKey";
$auth = base64_encode($userName.":".$apiKey);
$postParams = array(
"url" => "https://www.scraping-bot.io/realEstate.html",
'options' => array(
"useChrome" => false, //set to 'true' if you want to use headless chrome for javascript rendering
"premiumProxy" => false, //set to 'true' if you want to use premium proxies Unblock Amazon,Google,Rakuten
"proxyCountry" => null, //allows you to choose a country proxy (example: proxyCountry:"FR")
"waitForNetworkRequests" => false //wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
//this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
)
);
$apiEndPoint = "http://api.scraping-bot.io/scrape/real-estate";
$json = json_encode($postParams);
$curl = curl_init();
curl_setopt_array($curl, array(
CURLOPT_URL => $apiEndPoint,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
CURLOPT_CUSTOMREQUEST => "POST",
CURLOPT_POSTFIELDS => $json,
CURLOPT_HTTPHEADER => array(
"Authorization: Basic ".$auth,
"Content-Type: application/json"
),
));
$response = curl_exec($curl);
$err = curl_error($curl);
curl_close($curl);
if ($err) {
echo "cURL Error #:" . $err;
} else {
echo $response;
}
import requests
import json
url='https://www.scraping-bot.io/realEstate.html'
username = 'yourUsername'
apiKey = 'yourApiKey'
apiEndPoint = "http://api.scraping-bot.io/scrape/real-estate"
options = {
"useChrome": False,#set to True if you want to use headless chrome for javascript rendering
"premiumProxy": False, # set to True if you want to use premium proxies Unblock Amazon,Google,Rakuten
"proxyCountry": None, # allows you to choose a country proxy (example: proxyCountry:"FR")
"waitForNetworkRequests":False # wait for most ajax requests to finish until returning the Html content (this option can only be used if useChrome is set to true),
# this can slowdown or fail your scraping if some requests are never ending only use if really needed to get some price loaded asynchronously for example
}
payload = json.dumps({"url":url,"options":options})
headers = {
'Content-Type': "application/json"
}
response = requests.request("POST", apiEndPoint, data=payload, auth=(username,apiKey), headers=headers)
print(response.text)
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Base64;
public class AdvancedRawJava {
public static void main(String[] args) {
try {
String username = "yourUsername";
String apiKey = "yourApiKey";
String originalInput = username + ":" + apiKey;
String encodedString = "Basic " + Base64.getEncoder().encodeToString(originalInput.getBytes());
String apiEndPoint = "http://api.scraping-bot.io/scrape/real-estate";
URL url = new URL(apiEndPoint);
HttpURLConnection con = (HttpURLConnection) url.openConnection();
con.setRequestMethod("POST");
con.setRequestProperty("Content-Type", "application/json; charset=UTF-8");
con.setRequestProperty("Authorization", encodedString);
String useChrome = "false";//set to "true" if you want to use headless chrome for javascript rendering
String premiumProxy = "false";//set to "true" if you want to use premium proxies Unblock Amazon,Google,Rakuten
String urlToScrape = "https://www.scraping-bot.io/realEstate.html";
String proxyCountry = null;//allows you to choose a country proxy (example: proxyCountry:"FR")
String waitForNetworkRequests = "false";//set to 'true' if you want to use 'networkidle2'
String param = "{\"url\":\""+urlToScrape+"\","+
"\"options\":{"+
"\"useChrome\":"+useChrome+","+
"\"premiumProxy\":"+premiumProxy+","+
"\"proxyCountry\":"+proxyCountry+","+
"\"waitForNetworkRequests\":"+waitForNetworkRequests+
"}"+
"}";
con.setDoOutput(true);
OutputStream out = con.getOutputStream();
out.write(param.getBytes());
out.flush();
out.close();
int status = con.getResponseCode();
System.out.println(status);
BufferedReader in = new BufferedReader(
new InputStreamReader(con.getInputStream()));
String inputLine;
StringBuilder content = new StringBuilder();
while ((inputLine = in.readLine()) != null) {
content.append(inputLine);
}
String jsonResponse = content.toString();
System.out.println(jsonResponse);
in.close();
con.disconnect();
} catch (Exception e) {
System.out.println("An error occured while scraping:" + e);
}
}
}
Each request will return a string with raw JSON
- The error field is null by default and is filled with a string if some error occurred
- The data field contains by default all fields listed below
Result example ⤵
{
"error": null,
"data": {
"title": "Location Studio Montpellier - 415€/mois - appartement F1/T1/1 pièce 18m²",
"description": "34000 : Appartement disponible le 22 / 01 / 2020 Situé à MONTPELLIER (34000) rue des amaryllis, proche arceaux, du tramway ligne 3 et de la ligne de bus n°10, cet appartement est un T1 de 18,35 m² comprenant une pièce de vie entièrement repeinte avec une kitchenette neuve, ...",
"surfaceArea": 18,
"surfaceAreaUnit": "sqm",
"price": null,
"currency": "EUR",
"numberOfRooms": 1,
"numberOfBedrooms": 0,
"publishingDate": null,
"monthlyRent": 415,
"weeklyRent": null,
"marketedBy": {
"name": "GUY HOQUET BPMI",
"address": "1 Place Pierre Renaudel 34080 Montpellier",
"phoneNumber": "0411280090"
},
"energyClass": "E:284",
"greenhouseGazClass": null,
"siteURL": "https://www.seloger.com/annonces/locations/appartement/montpellier-34/les-cevennes/154966057.htm",
"siteHtml": null,
"error": null,
"statusCode": null,
"htmlLength": 181005,
"captchaFound": false,
"isHtmlPage": true,
"host": "www.seloger.com",
"codeinsee": "340172"
}
}
Search Engine API
Want to scrape Google or Bing search results? This API endpoint is for you.
Our search engine web scraping API has been built to extract data from the results page of Google or Bing. Create your request within the parameters, and get the data you need in the format you want.
Endpoint : POST http://api.scraping-bot.io/scrape/search-engine
Request parameters :
- engine: String, the search engine you want to use, can be "google" or "bing"
- searchType: String, right now the only possible value is "search"; later on we'll add more options for different types of search.
- search: String, the search you want to do (you can use Google search operators such as title:, site:, …)
- format: String, can be “json”, “json-html” (for json format with raw html in a property), or “html” for raw html only
- domainCountry: String, set this option to simulate the country of origin for the search request, available codes: for Google, for Bing
- resultLang: String, set this option to have the search results in a specific language, available lang: for Google, for Bing
- jobOffers: Boolean (default false), will return the job offers matching the search parameter
- uule: String, allow more precise geotargeting, works great with job offers to get offers from a specific region or city
var request = require('request');
var username = "yourUsername",
apiKey = "yourApiKey",
apiEndPoint = "http://api.scraping-bot.io/scrape/search-engine",
auth = "Basic " + new Buffer(username + ":" + apiKey).toString("base64");
request(
{
method: 'POST',
url: apiEndPoint,
json: {
engine: "google", // google, bing
searchType: "search",
search: "pineapple on pizza",
format: "json", // json, json-html, html
domainCountry: "US",
resultLang: "EN",
jobOffers: false, // true or false, optional for google only, default false
uule: "w+CAIQICIiQnJvb2tseW4sQ29ubmVjdGljdXQsVW5pdGVkIFN0YXRlcw"// optional for google only
},
headers: {
Accept: 'application/json',
Authorization : auth
},
},
function(error, response, body) {
console.log(body);
}
);
#!/bin/bash
username='yourUsername'
api_key='yourApiKey'
auth=$(echo -ne "$username:$api_key" | base64);
#parameters
engine='google' # google, bing
searchType='search'
search='pineapple on pizza'
format='json' # json, json-html, html
domainCountry='US'
resultLang='EN'
jobOffers='false' # true or false, optional for google only, default false
uule='w+CAIQICIiQnJvb2tseW4sQ29ubmVjdGljdXQsVW5pdGVkIFN0YXRlcw' # optional for google only
apiEndPoint='http://api.scraping-bot.io/scrape/search-engine'
curl -X POST \
$apiEndPoint \
-H "Authorization: Basic $auth" \
-H "Content-Type: application/json" \
-d "{\"engine\":\"$engine\",\"searchType\":\"$searchType\",\"search\":\"$search\",\"format\":\"$format\",\"domainCountry\":\"$domainCountry\",\"resultLang\":\"$resultLang\",\"jobOffers\":$jobOffers,\"uule\":\"$uule\"}"
<?php
$userName="yourUsername";
$apiKey="yourApiKey";
$auth = base64_encode($userName.":".$apiKey);
$postParams = array(
"engine" => "google", // google, bing
"searchType" => "search",
"search" => "pineapple on pizza",
"format" => "json", // json, json-html, html
"domainCountry" => "US",
"resultLang" => "EN",
"jobOffers" => false, // true or false, optional for google only, default false
"uule" => "w+CAIQICIiQnJvb2tseW4sQ29ubmVjdGljdXQsVW5pdGVkIFN0YXRlcw"// optional for google only
);
$apiEndPoint = "http://host.docker.internal:4000/scrape/search-engine";
$json = json_encode($postParams);
$curl = curl_init();
curl_setopt_array($curl, array(
CURLOPT_URL => $apiEndPoint,
CURLOPT_RETURNTRANSFER => true,
CURLOPT_HTTP_VERSION => CURL_HTTP_VERSION_1_1,
CURLOPT_CUSTOMREQUEST => "POST",
CURLOPT_POSTFIELDS => $json,
CURLOPT_HTTPHEADER => array(
"Authorization: Basic ".$auth,
"Content-Type: application/json"
),
));
$response = curl_exec($curl);
$err = curl_error($curl);
curl_close($curl);
if ($err) {
echo "cURL Error #:" . $err;
} else {
echo $response;
}
import requests
import json
username = 'yourUsername'
apiKey = 'yourApiKey'
apiUrl = "http://api.scraping-bot.io/scrape/search-engine"
body = {"engine": "google", "searchType": "search", "search": "pineapple on pizza", "format": "json", "domainCountry": "US", "resultLang": "EN", "jobOffers": False, "uule": "w+CAIQICIiQnJvb2tseW4sQ29ubmVjdGljdXQsVW5pdGVkIFN0YXRlcw"}
payload = json.dumps(body)
headers = {
'Content-Type': "application/json"
}
response = requests.request("POST", apiUrl, data=payload, auth=(username,apiKey), headers=headers)
print(response.text.encode('utf-8'))
import java.io.BufferedReader;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.Base64;
public class AdvancedRawJava {
public static void main(String[] args) {
try {
String username = "yourUsername";
String apiKey = "yourApiKey";
String originalInput = username + ":" + apiKey;
String encodedString = "Basic " + Base64.getEncoder().encodeToString(originalInput.getBytes());
String apiEndPoint = "http://api.scraping-bot.io/scrape/search-engine";
URL url = new URL(apiEndPoint);
HttpURLConnection con = (HttpURLConnection) url.openConnection();
con.setRequestMethod("POST");
con.setRequestProperty("Content-Type", "application/json; charset=UTF-8");
con.setRequestProperty("Authorization", encodedString);
String engine = "google"; // google, bing
String searchType = "search";
String search = "pineapple on pizza";
String format = "json"; // json, json-html, html
String domainCountry = "US";
String resultLang = "EN";
String jobOffers = "false"; // true or false, optional for google only, default false
String uule = "w+CAIQICIiQnJvb2tseW4sQ29ubmVjdGljdXQsVW5pdGVkIFN0YXRlcw";// optional for google only
String param = "{\"engine\":\""+engine+"\","+
"\"searchType\":\""+searchType+"\","+
"\"search\":\""+search+"\","+
"\"format\":\""+format+"\","+
"\"domainCountry\":\""+domainCountry+"\","+
"\"resultLang\":\""+resultLang+"\","+
"\"jobOffers\":"+jobOffers+","+
"\"uule\":\""+uule+
"\"}";
con.setDoOutput(true);
OutputStream out = con.getOutputStream();
out.write(param.getBytes());
out.flush();
out.close();
int status = con.getResponseCode();
System.out.println(status);
BufferedReader in = new BufferedReader(
new InputStreamReader(con.getInputStream()));
String inputLine;
StringBuilder content = new StringBuilder();
while ((inputLine = in.readLine()) != null) {
content.append(inputLine);
}
String jsonResponse = content.toString();
System.out.println(jsonResponse);
in.close();
con.disconnect();
} catch (Exception e) {
System.out.println("An error occured while scraping:" + e);
}
}
}
Each request will return a string with raw JSON or HTML depending on the "format" param
Result example ⤵
{
"general": {
"search_engine": "google",
"query": "pineapple on pizza",
"results_cnt": 176000000,
"search_time": 0.64,
"language": "en",
"location": "Brooklyn, Connecticut",
"mobile": false,
"basic_view": false,
"search_type": "text",
"page_title": "pineapple on pizza - Google Search",
"code_version": "1.354",
"timestamp": "2021-06-21T09:31:02.865Z"
},
"organic": [
{
"link": "https://www.goodfood.com.au/eat-out/news/why-do-so-many-people-find-pineapple-on-pizza-offensive-20190424-h1dqrc",
"display_link": "https://www.goodfood.com.au › eat-out › news › why-do...",
"title": "Why do so many people find pineapple on pizza offensive?",
"description": "Apr 24, 2019 — Those who live in the Northeast or are older than 55 hate pineapple toppings even more. The pizza delivery app Slice conducted a survey in ...",
"rank": 1,
"global_rank": 1
},
{
"link": "https://www.washingtonpost.com/news/voraciously/wp/2019/04/23/pineapple-on-pizza-is-easy-to-hate-at-least-in-theory/",
"display_link": "https://www.washingtonpost.com › news › 2019/04/23",
"title": "Pineapple on pizza is easy to hate — at least in theory - The ...",
"description": "Apr 23, 2019 — Too many people, it seems, have been burned by poor versions of the Hawaiian, in which thick chunks of syrupy pineapple are merely dumped ...",
"rank": 2,
"global_rank": 6
},
{
"link": "https://www.independent.co.uk/life-style/food-and-drink/pizza-pineapple-is-it-acceptable-chefs-expert-opinion-a8074571.html",
"display_link": "https://www.independent.co.uk › ... › Food and Drink",
"title": "Is pineapple on pizza acceptable? Chefs weigh in - The ...",
"description": "Apr 9, 2019 — \"To those who say pineapple isn't an acceptable pizza topping because it doesn't taste good, they probably haven't had it done right. Raw chunks ...",
"rank": 3,
"global_rank": 7
},
{
"link": "https://en.wikipedia.org/wiki/Hawaiian_pizza",
"display_link": "https://en.wikipedia.org › wiki › Hawaiian_pizza",
"title": "Hawaiian pizza - Wikipedia",
"description": "Hawaiian pizza is a type of pizza originating in Canada, best known for having pineapple and either ham or bacon as toppings. Hawaiian pizza is commonly ...",
"rank": 4,
"global_rank": 8
},
{
"link": "https://soyummy.com/pineapple-pizza-history/",
"display_link": "https://soyummy.com › pineapple-pizza-history",
"title": "Pineapple Pizza: History Of The Controversial Hawaiian Pizza",
"description": "Oct 22, 2019 — In 2014, Thrillist published a list of the best and worst toppings. ... Pineapple was ranked last. “Poor, misunderstood pineapple. It unanimously ...",
"rank": 5,
"global_rank": 9
},
{
"link": "http://spizzicorestaurant.com/pizza-debate/",
"display_link": "http://spizzicorestaurant.com › pizza-debate",
"title": "To Pineapple or To Not: A Pizza Debate - Spizzico Italian ...",
"description": "Mar 15, 2019 — When asked why they love pineapple on pizza, most would say, “because it tastes good.” It's just that simple. But, more specifically, they like the ...",
"rank": 6,
"global_rank": 10
},
{
"link": "https://www.bbc.co.uk/bitesize/articles/z2vftrd",
"display_link": "https://www.bbc.co.uk › bitesize › articles",
"title": "Pineapple on pizza: Where did it come from and why do ... - BBC",
"description": "Speaking to food website La Cucina Italiana, Pepe said he thought the reason many were against pineapple was because it clashed too much with the base sauce ...",
"rank": 7,
"global_rank": 11
},
{
"link": "https://spoonuniversity.com/lifestyle/ending-the-debate-why-pineapple-belongs-on-pizza",
"display_link": "https://spoonuniversity.com › Lifestyle",
"title": "Ending the Debate: Why Pineapple Belongs on Pizza - Spoon ...",
"description": "First of all, it is scientifically proven that pineapple belongs on pizza. There is nothing better than a little bit of sweetness to cut through a salty snack like pizza.",
"rank": 8,
"global_rank": 12
}
],
"images": [
{
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRDNREZzSen-Gx_epXsao44hN1frSwExGZacU9Ek4cLvQ&s",
"image_alt": "Image result for pineapple on pizza",
"link": "https://www.tastingtable.com/dine/national/hawaiian-pineapple-pizza-history",
"rank": 1,
"global_rank": 16
},
{
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRUTeOW1gdYoAQsa8c7LKZxQAibt3kQp9_pWYWaSPMmtQ&s",
"image_alt": "Image result for pineapple on pizza",
"link": "http://spizzicorestaurant.com/pizza-debate/",
"rank": 2,
"global_rank": 17
},
{
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRamIUqUZLUBBxEJWQC4FWQqHZnQKgdbwucz7FMzNhQxg&s",
"image_alt": "Image result for pineapple on pizza",
"link": "https://www.theguardian.com/lifeandstyle/2018/may/12/rachel-cooke-pineapple-hawaiian-pizza-food-snob-authentic",
"rank": 3,
"global_rank": 18
},
{
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQg1JVQl120-457SBJooG6qXomFlrIAMN8wu_oEDpm3wg&s",
"image_alt": "Image result for pineapple on pizza",
"link": "https://www.independent.co.uk/life-style/food-and-drink/pizza-pineapple-is-it-acceptable-chefs-expert-opinion-a8074571.html",
"rank": 4,
"global_rank": 19
},
{
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQZe-B-a0ZJEkEQm4PHhrKZ6EO8OBR1cYx6_N2jjA4gGw&s",
"image_alt": "Image result for pineapple on pizza",
"link": "https://www.scmp.com/lifestyle/food-drink/article/3001463/pineapple-pizza-worlds-best-pizzaiolo-approves-and-hes",
"rank": 5,
"global_rank": 20
},
{
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSQIUoDRBt9fR55Ep5exrRlxotm7oO5Fap8tLWx9RNnlA&s",
"image_alt": "Image result for pineapple on pizza",
"link": "https://www.theguardian.com/lifeandstyle/2017/jun/10/sam-panapoulos-inventor-of-hawaiian-pizza-dies-aged-83",
"rank": 6,
"global_rank": 21
},
{
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRuZQ-5OBs93K6cBAnDw62g3Rd4w3B2yUGRfesR8NrJuQ&s",
"image_alt": "Image result for pineapple on pizza",
"link": "https://www.goodfood.com.au/eat-out/news/why-do-so-many-people-find-pineapple-on-pizza-offensive-20190424-h1dqrc",
"rank": 7,
"global_rank": 22
},
{
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ_zAKTOQ_MuDaJEijs7IDO_mYqRfQzkcKDWmAwEJffIg&s",
"image_alt": "Image result for pineapple on pizza",
"link": "https://www.today.com/food/pineapple-acceptable-pizza-topping-america-decides-t118980",
"rank": 8,
"global_rank": 23
},
{
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQRJ21_R1kIdN590_UcmQuluZ5EVzZsuqU0Tg15CVq--g&s",
"image_alt": "Image result for pineapple on pizza",
"link": "https://thspublications.com/opinion/2019/04/29/pineapple-pizza-a-disgrace-to-humanity/",
"rank": 9,
"global_rank": 24
},
{
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcSQrUIHPRah3cZLUyNgPqirYk751XjR6Jk2094658k0Ow&s",
"image_alt": "Image result for pineapple on pizza",
"link": "https://today.yougov.com/topics/food/articles-reports/2019/02/04/pineapple-pizza-toppings-pepperoni-popular",
"rank": 10,
"global_rank": 25
}
],
"recipes": {
"title": "Recipes",
"items": [
{
"title": "Hawaiian Pizza",
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcRWd_HNHmUq6X4vZLi0qoLEvGT2zTtLCHvCLoKL1ONtdcKKp2LS-lBU&s=0",
"link": "https://sallysbakingaddiction.com/hawaiian-pizza/",
"rating": 5,
"reviews_cnt": 2,
"source": "Sallys Baking Addiction",
"cook_time": "2 hr 30 min",
"ingredients": [
"Canadian bacon",
"homemade pizza crust",
"pizza sauce",
"mozzarella cheese",
"pineapple chunks"
],
"rank": 1,
"global_rank": 13
},
{
"title": "Hawaiian Pizza",
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcS81xZBKFdaheFSSz5WEKDgl6cqwJC5CjuJnwp5Kcjy1O_cUJDzAzEU&s=0",
"link": "https://www.thespruceeats.com/hawaiian-pizza-4691857",
"rating": 3.8,
"reviews_cnt": 29,
"source": "The Spruce Eats",
"cook_time": "19 min",
"ingredients": [
"Pizza dough",
"pizza sauce",
"mozzarella cheese",
"cooked ham",
"pineapple chunks"
],
"rank": 2,
"global_rank": 14
},
{
"title": "Homemade Hawaiian Pizza",
"image": "https://encrypted-tbn0.gstatic.com/images?q=tbn:ANd9GcQ4YRb3vYH5KBeAD9pjLGc3H5wKoLjel23KTmzocTpArrdbzDJjixXR&s=0",
"link": "https://www.crazyforcrust.com/hawaiian-pizza-homemade-pizza-recipe/",
"rating": 5,
"reviews_cnt": 2,
"source": "Crazy for Crust",
"cook_time": "35 min",
"ingredients": [
"Canadian bacon",
"pizza sauce",
"mozzarella cheese",
"olive oil",
"yeast"
],
"rank": 3,
"global_rank": 15
}
]
},
"pagination": {
"current_page": 1,
"next_page_link": "https://www.google.com/search?q=pineapple+on+pizza&hl=en&gl=US&ei=VFzQYI3EGsbj_Aai6ZnABA&start=10&sa=N&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQ8NMDegQIARBJ",
"next_page_start": 10,
"next_page": 2,
"pages": [
{
"page": 2,
"link": "https://www.google.com/search?q=pineapple+on+pizza&hl=en&gl=US&ei=VFzQYI3EGsbj_Aai6ZnABA&start=10&sa=N&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQ8tMDegQIARA3",
"start": 10
},
{
"page": 3,
"link": "https://www.google.com/search?q=pineapple+on+pizza&hl=en&gl=US&ei=VFzQYI3EGsbj_Aai6ZnABA&start=20&sa=N&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQ8tMDegQIARA5",
"start": 20
},
{
"page": 4,
"link": "https://www.google.com/search?q=pineapple+on+pizza&hl=en&gl=US&ei=VFzQYI3EGsbj_Aai6ZnABA&start=30&sa=N&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQ8tMDegQIARA7",
"start": 30
},
{
"page": 5,
"link": "https://www.google.com/search?q=pineapple+on+pizza&hl=en&gl=US&ei=VFzQYI3EGsbj_Aai6ZnABA&start=40&sa=N&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQ8tMDegQIARA9",
"start": 40
},
{
"page": 6,
"link": "https://www.google.com/search?q=pineapple+on+pizza&hl=en&gl=US&ei=VFzQYI3EGsbj_Aai6ZnABA&start=50&sa=N&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQ8tMDegQIARA_",
"start": 50
},
{
"page": 7,
"link": "https://www.google.com/search?q=pineapple+on+pizza&hl=en&gl=US&ei=VFzQYI3EGsbj_Aai6ZnABA&start=60&sa=N&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQ8tMDegQIARBB",
"start": 60
},
{
"page": 8,
"link": "https://www.google.com/search?q=pineapple+on+pizza&hl=en&gl=US&ei=VFzQYI3EGsbj_Aai6ZnABA&start=70&sa=N&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQ8tMDegQIARBD",
"start": 70
},
{
"page": 9,
"link": "https://www.google.com/search?q=pineapple+on+pizza&hl=en&gl=US&ei=VFzQYI3EGsbj_Aai6ZnABA&start=80&sa=N&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQ8tMDegQIARBF",
"start": 80
},
{
"page": 10,
"link": "https://www.google.com/search?q=pineapple+on+pizza&hl=en&gl=US&ei=VFzQYI3EGsbj_Aai6ZnABA&start=90&sa=N&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQ8tMDegQIARBH",
"start": 90
}
]
},
"people_also_ask": [
{
"question": "Why pineapple on pizza is bad?",
"question_link": "https://www.google.com/search?hl=en&gl=US&q=Why+pineapple+on+pizza+is+bad%3F&sa=X&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQzmd6BAgIEAs",
"answer_html": "<div id=\"5\"></div>",
"rank": 1,
"global_rank": 2
},
{
"question": "Should Pineapple be on pizza?",
"question_link": "https://www.google.com/search?hl=en&gl=US&q=Should+Pineapple+be+on+pizza%3F&sa=X&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQzmd6BAgIEA8",
"answer_html": "<div id=\"8\"></div>",
"rank": 2,
"global_rank": 3
},
{
"question": "Is pineapple on pizza a sin?",
"question_link": "https://www.google.com/search?hl=en&gl=US&q=Is+pineapple+on+pizza+a+sin%3F&sa=X&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQzmd6BAgIEBM",
"answer_html": "<div id=\"11\"></div>",
"rank": 3,
"global_rank": 4
},
{
"question": "What does Gordon Ramsay say about pineapple on pizza?",
"question_link": "https://www.google.com/search?hl=en&gl=US&q=What+does+Gordon+Ramsay+say+about+pineapple+on+pizza%3F&sa=X&ved=2ahUKEwiNvt2mtajxAhXGMd8KHaJ0BkgQzmd6BAgIEBc",
"answer_html": "<div id=\"14\"></div>",
"rank": 4,
"global_rank": 5
}
]
}
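When you set the "format" param to "json", the body parses straight into the structure shown above. Here is a minimal Python sketch (reusing the credentials, endpoint and body from the Python sample above) that prints the organic results and the related questions; the field names are the ones visible in the result example, and blocks like people_also_ask are not always present, hence the .get() with a default:
import requests

username = 'yourUsername'
apiKey = 'yourApiKey'
apiUrl = "http://api.scraping-bot.io/scrape/search-engine"
body = {"engine": "google", "searchType": "search", "search": "pineapple on pizza",
        "format": "json", "domainCountry": "US", "resultLang": "EN"}

# same request as in the Python sample above; requests sets the JSON Content-Type for us
response = requests.post(apiUrl, json=body, auth=(username, apiKey))
result = response.json()

print(result["general"]["query"], "-", result["general"]["results_cnt"], "results")
# organic results are ordered by rank
for item in result["organic"]:
    print(item["rank"], item["title"], item["link"])
# related questions, when a "people also ask" block is returned
for question in result.get("people_also_ask", []):
    print(question["question"])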
Social Media API
You want to scrape social networks like Facebook, Twitter, Instagram or LinkedIn? This API endpoint is for you.
This endpoint is different from our other endpoints as this one works in two steps:
- The initial POST request with the body parameters needed to scrape (scraper + other parameters depending on the selected scraper); this request returns a unique responseId.
- The second GET request with the scraper and responseId as query parameters; this will either give you the scraping result, tell you that the scraping is not complete yet, or tell you that the scraping failed (in which case credits are refunded).
We implemented this two-step approach because social media websites have a lot of protections against bots and scraping. A lot happens on our side to avoid being blocked, which makes the scraping process noticeably longer. This felt like the best solution for both you and us, as your code does not hang for minutes waiting for a single request to finish.
Request endpoint : POST http://api.scraping-bot.io/scrape/data-scraper
POST request parameters :
- scraper: String, the name of the scraper you want to use, required.
- url: String, the url of the page you wish to extract data from.
- account: String, the account name you want to extract data from.
- search: String, the term or hashtag you wish to extract data from.
- hashtag: String, the hashtag you wish to extract data from.
scraper name | scraper | url | account | search | hashtag |
---|---|---|---|---|---|
linkedinProfile | X | X | |||
linkedinCompanyProfile | X | X | |||
linkedinPost | X | X | |||
instagramProfile | X | X | |||
instagramPost | X | X | |||
instagramHashtag | X | X | |||
facebookProfile | X | X | |||
facebookPost | X | X | |||
facebookOrganization | X | X | |||
twitterProfile | X | X | |||
twitterSearch | X | X | |||
tiktokProfile | X | X | |||
tiktokHashtag | X | X |
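For example, the body sent for the linkedinCompanyProfile scraper in the code samples further down is simply {"scraper": "linkedinCompanyProfile", "url": "https://linkedin.com/company/google"}; for another scraper, swap url for whichever parameter applies to it (account, search or hashtag, as listed above).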
Response endpoint : GET http://api.scraping-bot.io/scrape/data-scraper-response?responseId=xxxx&scraper=linkedinCompanyProfile
GET request parameters :
- scraper: String, same as above; it must match the one you used to get the responseId, required.
- responseId: String, the responseId you received from the POST request response, required.
Here is an example of how you can implement simple code that handles both requests. The second request is repeated every few seconds (given how long social media scraping can take, polling every 5 seconds or more is enough; there is no point in checking more often) until it is no longer pending (it either has a result or an error).
let request = require('request-promise');
let username = "yourUsername",
apiKey = "yourApiKey",
apiEndPoint = "http://api.scraping-bot.io/scrape/data-scraper",
auth = "Basic " + new Buffer(username + ":" + apiKey).toString("base64");
function sleep(ms) {
return new Promise((resolve) => {
setTimeout(resolve, ms);
});
}
async function scrape() {
let response = await request(
{
method: 'POST',
url: apiEndPoint,
json: {
scraper: "linkedinCompanyProfile",
url: "https://linkedin.com/company/google",
},
headers: {
Accept: 'application/json',
Authorization : auth
},
}
);
console.log("responseId received : ", response.responseId);
let finalData;
do {
// check for response data every 5s or more; there is no need to check more often, as scraping data from
// social media takes quite a bit longer than normal scraping, and we limit how often you can do those checks
await sleep(5000);
let responseUrl = "http://api.scraping-bot.io/scrape/data-scraper-response?scraper=linkedinCompanyProfile&responseId=" + response.responseId;
finalData = await request({
method: 'GET',
url: responseUrl,
headers: {
Accept: 'application/json',
Authorization : auth
},
json:true
});
console.log("data recieved : ",finalData);
} while (finalData == null || finalData.status === "pending")
return finalData;
}
scrape().then(result => console.log(result));
let axios = require('axios');
let username = "yourUsername",
apiKey = "yourApiKey",
apiEndPoint = "http://api.scraping-bot.io/scrape/data-scraper",
auth = "Basic " + new Buffer(username + ":" + apiKey).toString("base64");
function sleep(ms) {
return new Promise((resolve) => {
setTimeout(resolve, ms);
});
}
async function scrape() {
try {
let response = await axios.post(apiEndPoint, {
scraper: "linkedinCompanyProfile",
url: "https://linkedin.com/company/google",
}, {
headers: {
Accept: 'application/json',
Authorization: auth
}
});
console.log("responseId received : ", response.data.responseId);
let finalData;
do {
// check for response data every 5s or more; there is no need to check more often, as scraping data from
// social media takes quite a bit longer than normal scraping, and we limit how often you can do those checks
await sleep(5000);
let responseUrl = "http://api.scraping-bot.io/scrape/data-scraper-response?scraper=linkedinCompanyProfile&responseId=" + response.data.responseId;
finalData = await axios.get(responseUrl,{
headers: {
Accept: 'application/json',
Authorization: auth
}
});
} while (finalData == null || finalData.data.pending === true)
return finalData.data;
} catch (e) {
console.log(e.message)
}
}
scrape().then(result => console.log(result));
import requests
import json
from time import sleep
username = 'yourUsername'
apiKey = 'yourApiKey'
scraper = 'linkedinCompanyProfile'
url = 'https://linkedin.com/company/google'
apiEndPoint = "http://api.scraping-bot.io/scrape/data-scraper"
apiEndPointResponse = "http://api.scraping-bot.io/scrape/data-scraper-response?"
payload = json.dumps({"url": url, "scraper": scraper})
headers = {
'Content-Type': "application/json"
}
response = requests.request("POST", apiEndPoint, data=payload, auth=(username, apiKey), headers=headers)
if response.status_code == 200:
    print(response.json())
    print(response.json()["responseId"])
    responseId = response.json()["responseId"]
    pending = True
    while pending:
        # sleep 5s between each loop, social-media scraping can take quite long to complete
        # so there is no point calling the api quickly as we will return an error if you do so
        sleep(5)
        finalResponse = requests.request("GET", apiEndPointResponse + "scraper=" + scraper + "&responseId=" + responseId,
                                         auth=(username, apiKey))
        result = finalResponse.json()
        if type(result) is list:
            pending = False
            print(finalResponse.text)
        elif type(result) is dict:
            if "status" in result and result["status"] == "pending":
                print(result["message"])
                continue
            elif result["error"] is not None:
                pending = False
                print(json.dumps(result, indent=4))
else:
    print(response.text)
import requests
import json
from time import sleep
username = 'yourUsername'
apiKey = 'yourApiKey'
scraper = 'linkedinCompanyProfile'
urls = ['https://linkedin.com/company/google', "https://www.linkedin.com/company/panorasoft/"]
apiEndPoint = "http://api.scraping-bot.io/scrape/data-scraper"
apiEndPointResponse = "http://api.scraping-bot.io/scrape/data-scraper-response?"
scrapingData = {}
#create scraping data for each URL
for url in urls:
    scrapingData[url] = {
        'pending' : True,
        'responseId' : None,
        'response' : None,
        'error' : None
    }
#request a responseId for each URL
for url in urls:
    payload = json.dumps({"url": url, "scraper": scraper})
    headers = {
        'Content-Type': "application/json"
    }
    response = requests.request("POST", apiEndPoint, data=payload, auth=(username, apiKey), headers=headers)
    if response.status_code == 200:
        print(response.json())
        print(response.json()["responseId"])
        scrapingData[url]['responseId'] = response.json()["responseId"]
    else:
        scrapingData[url]['error'] = response.text
#all responseIds have been requested, now poll for each result
pendingQueries = len(urls)
while pendingQueries > 0:
    # sleep 5s between each loop, social-media scraping can take quite long to complete
    # so there is no point calling the api quickly as we will return an error if you do so
    sleep(5)
    #ask each one
    for url in urls:
        if scrapingData[url]["pending"]:
            finalApiCall = apiEndPointResponse + "scraper=" + scraper + "&responseId=" + str(scrapingData[url]["responseId"])
            print(finalApiCall)
            finalResponse = requests.request("GET", finalApiCall, auth=(username, apiKey))
            result = finalResponse.json()
            if type(result) is list:
                scrapingData[url]["pending"] = False
                scrapingData[url]["response"] = finalResponse.text
                print(url + " : is done")
            elif type(result) is dict:
                if "status" in result and result["status"] == "pending":
                    print(result["message"])
                    print(url + " : is still pending")
                    continue
                elif result["error"] is not None:
                    scrapingData[url]["pending"] = False
                    scrapingData[url]["error"] = json.dumps(result, indent=4)
                    print(url + " : got an error")
                    print(json.dumps(result, indent=4))
            if scrapingData[url]["pending"] == False:
                pendingQueries -= 1
print("All scrapings are done!")
print(json.dumps(scrapingData, indent=4))
Here is an example of the result you’d receive when using the linkedinCompanyProfile scraper as above :
[
{
"url": "https://linkedin.com/company/google",
"name": "Google",
"sphere": "Internet",
"followers": 21781170,
"logo": "https://media-exp3.licdn.com/dms/image/C4D0BAQHiNSL4Or29cg/company-logo_200_200/0/1519856215226?e=2159024400&v=beta&t=r--a5-Dl4gvVE-xIkq8QyBzZ8mQ-OYwBOrixNzR95H0",
"image": "https://media-exp3.licdn.com/dms/image/C4E1BAQH5nC0DmQkbdw/company-background_10000/0/1521522820274?e=2159024400&v=beta&t=wUKk4xOct35R0-E1u0bbRjTFSOhudOdmKqBFuTNK67A",
"employeesAmountInLinkedin": "242238",
"about": "A problem isn't truly solved until it's solved for all. Googlers build products that help create opportunities for everyone, whether down the street or across the globe. Bring your insight, imagination and a healthy disregard for the impossible. Bring everything that makes you unique. Together, we can build for everyone.\n\nCheck out our career opportunities at careers.google.com. ",
"locations": [
"1600 Amphitheatre Parkway, Mountain View, CA 94043, US",
"Unter den Linden 14, Berlin, BE 10117, DE",
"Carrera 11A 94-45, Bogota, Bogota, D.C. 110221, CO",
"Avenida Alicia Moreau de Justo 350, Buenos Aires City, Buenos Aires Autonomous City 1107, AR",
"Erika-Mann-Strasse 33, Munich, BY 80636, DE",
"19510 Jamboree Rd, Irvine, CA 92612, US",
"340 Main St, Los Angeles, CA 90291, US",
"901 Cherry Ave, San Bruno, CA 94066, US",
"345 Spear St, San Francisco, CA 94105, US",
"Montes Urales, Miguel Hidalgo, CDMX 11000, MX",
"2590 Pearl St, Boulder, CO 80302, US",
"Plaza Pablo Ruiz Picasso, Madrid, Community of Madrid 28046, ES",
"Plaza Pablo Ruiz Picasso, Madrid, Community of Madrid 28020, ES",
"Barrow Street, Dublin, County Dublin, IE",
"25 Massachusetts Ave NW, Washington, DC 20001, US",
"St Giles High Street, London, England WC2H 8AG, GB",
"10 10th St NE, Atlanta, GA 30309, US",
"ABC-Strasse 19, Hamburg, HH 20354, DE",
"2 Matheson St, Wan Chai, Hong Kong, HK",
"15, Gurugram, HR 122001, IN",
"8 Rue de Londres, Paris, IdF 75009, FR",
"320 N Morgan St, Chicago, IL 60607, US",
"Old Madras Road, Bengaluru, Karnataka 560016, IN",
"3 Swamy Vivekananda Road, Bengaluru, Karnataka 560016, IN",
"Via Federico Confalonieri, 4, Milan, Lomb. 20124, IT",
"355 Main St, Cambridge, MA 02142, US",
"ulica Emilii Plater 53, Warsaw, MA 00-125, PL",
"3 Bandra Kurla Complex Road, Mumbai, Maharashtra 400051, IN",
"2300 Traverwood Dr, Ann Arbor, MI 48105, US",
"5th Ave, Taguig City, National Capital Region, PH",
"Claude Debussylaan 34, Amsterdam, North Holland 1082 MD, NL",
"48 Pirrama Rd, Sydney, NSW 2009, AU",
"111 8th Ave, New York, NY 10011, US",
"51 Breithaupt St, Kitchener, ON N2H 5G5, CA",
"111 Richmond St W, Toronto, ON M5H 2G4, CA",
"Avenida Costanera Sur, Las Condes, Santiago Metropolitan 7550000, CL",
"3 Pasir Panjang Rd, Singapore, Singapore 118484, SG",
"Avenida Brigadeiro Faria Lima, 3477, Sao Paulo, SP 04538-133, BR",
"Kungsbron 2, Stockholm, Stockholm County 111 22, SE",
"Yigal Allon 98, Tel Aviv-Yafo, Tel Aviv 67891, IL",
"13, Hyderabad, TS 500084, IN",
"9606 N Mopac Expy, Austin, TX 78759, US",
"6175 Main St, Frisco, TX 75034, US",
"1875 Explorer St, Reston, VA 20190, US",
"90 Collins St, Melbourne, VIC 3000, AU",
"777 6th St S, Kirkland, WA 98033, US",
"601 N 34th St, Seattle, WA 98103, US",
"Brandschenkestrasse 110, Zurich, ZH 8002, CH"
],
"employees": [],
"updates": [
{
"title": "Google",
"time": "6d",
"text": "In the new series, Tech Bytes, #WomenTechmakers introduces Black women engineers and developers at Google who offer an expert critical overview of their technical field. Tune in as Erin Relford discusses the importance of staying protected online 🔐➡️ https://goo.gle/3dA6D8J",
"likes_count": "974 Likes",
"comments_count": "42 Comments"
},
{
"title": "Google",
"time": "7d",
"text": "“I was first introduced to computer science when I was 13 because my dad had seen a YouTube video about the importance of coding and the lack of computer science education in schools across the U.S. I was pretty resistant to learning how to code at the time, since I went to a small all-girls school where coding wasn't a super popular course of study. My dad ended up challenging me to see which of us could finish an online Python class fastest, and after a week he had given up on it and I ended up being super interested in the material. I taught myself how to code using online resources throughout middle school, and when I got to high school I was able to take CS classes. Since then, I’ve always known that I want to go into software engineering.” In the latest #MyPathToGoogle, software engineering intern Livia Seibert shares how a fun coding competition with her dad led her to becoming an intern at Google → https://goo.gle/2UOWgHE #GoogleInterns",
"likes_count": "7,451 Likes",
"comments_count": "130 Comments"
},
{
"title": "Google",
"time": "1w",
"text": "When you’re looking for a new role, why not try this trick: keep your old resume next to you for inspiration, but start with a blank document and create a resume specifically designed for each job you want. You can align your skills and experience with the job description and tie your work directly to the role qualifications (and don’t forget to include data). For more resume tips watch the video below 👀⬇️ https://lnkd.in/gE4azAS",
"likes_count": "2,520 Likes",
"comments_count": "82 Comments"
},
{
"title": "Google",
"time": "1w",
"text": "“I jumped at the chance to put some of my ideas into action by joining the Google.org Fellowship program — a program that matches Google employees with organizations in need for up to six months of full-time pro bono work on technical projects.” Googler Suresh Vedula has been volunteering for the last six months as a Google.org Fellow. Learn about his work with Wadhwani AI, a nonprofit developing an AI-based offline app to classify and count local pests in order to make pesticide use more efficient and improve yield for smallholder farmers in India ⬇️ https://goo.gle/3A6Jswz",
"likes_count": "2,694 Likes",
"comments_count": "56 Comments"
},
{
"title": "Google",
"time": "2w",
"text": "This year, we’re supporting LGBTQ+ communities with commitments to help fund and celebrate inclusive spaces that foster belonging for all. Get involved and learn more → http://pride.google",
"likes_count": "6,487 Likes",
"comments_count": "99 Comments"
},
{
"title": "Google",
"time": "2w",
"text": "\"As I progressed in life and my career, I found myself more interested in working in IT. I worked hard to transition from what I thought I wanted to do to where I am now. And I am happy I did – I love the work we do. I have had opportunities to work in different data center locations and in different roles, just by learning new skills and opening myself up to reach out to other site locations and their teams.” In the latest #MyPathToGoogle meet Joy Jackson, a data center technician on the global server operations team, who shares how she went from studying to be a graphic designer to discovering a passion for IT and joining the Google data center team → https://goo.gle/3zTLcZW",
"likes_count": "10,868 Likes",
"comments_count": "181 Comments"
},
{
"title": "Google",
"time": "2w",
"text": "“I joined Google Hyderabad last June and it has been exciting and invigorating to be part of the growing footprint of Google Cloud in India. My role in operations and strategy provides me an opportunity to help chart the growth of our Engineering Development Center spread across Hyderabad and Bangalore, and our engineering teams continue to grow. Our mission is to build a strong and diverse engineering organization based on the best talent in India, with a culture anchored in customer empathy, agility, innovation and inclusion, to help organizations in India and the world accelerate their digital transformation on Google Cloud. I joined Google during the lock-down and working remotely, though seemingly daunting has been extremely hassle free thanks to extremely supportive teams. It feels great to be part of the thriving tech team.” Thank you to Chitra Sood, Product Operations & Strategy Principal, for sharing her experience on the Google Cloud team. Head to our careers site to learn more about opportunities at: Google Hyderabad → https://goo.gle/3uJN8Am Google Bangalore → https://goo.gle/3gQLJob",
"likes_count": "5,743 Likes",
"comments_count": "149 Comments"
},
{
"title": "Google",
"time": "2w",
"text": "“I used to be worried about my evolution at my job in IT Support. I still remained even if I was performing basic repetitive tasks and could not express my passion for innovation and new technologies. I took a chance and applied for a Google conference scholarship, to which I was given the chance to attend the Velocity conference in London. This conference reminded me of how much I love computer science, why I chose to study it, and it inspired me to try to move beyond my comfort zone. Shortly after, I signed up at University to follow my passion to pursue research. Last year, I attended the European Women In Technology conference in Amsterdam. Once again, this was only possible due to Google's scholarship program. I met wonderful fellow women in tech who shared the same passion for innovation as me. I felt part of a community, I had the chance to express myself, alter and enrich my way of thinking and felt very motivated. One year later, I completed my PhD studies. The opportunity to attend these conferences helped me achieve my goals, find the way back to something I really love, and made me get my confidence back.” - Nicoleta Join Nicoleta and apply for a Google conference scholarship. New opportunities added all year round ➡️ https://lnkd.in/eT5-w3q",
"likes_count": "7,358 Likes",
"comments_count": "112 Comments"
}
],
"show_more": [
{
"title": "Amazon",
"subtitle": "Internet",
"location": "Seattle, WA",
"Links": "https://www.linkedin.com/company/amazon?trk=similar-pages"
},
{
"title": "Microsoft",
"subtitle": "Computer Software",
"location": "Redmond, Washington",
"Links": "https://www.linkedin.com/company/microsoft?trk=similar-pages"
},
{
"title": "Apple",
"subtitle": "Consumer Electronics",
"location": "Cupertino, California",
"Links": "https://www.linkedin.com/company/apple?trk=similar-pages"
},
{
"title": "Facebook",
"subtitle": "Internet",
"location": "Menlo Park, CA",
"Links": "https://www.linkedin.com/company/facebook?trk=similar-pages"
},
{
"title": "Netflix",
"subtitle": "Entertainment",
"location": "Los Gatos, CA",
"Links": "https://www.linkedin.com/company/netflix?trk=similar-pages"
},
{
"title": "IBM",
"subtitle": "Information Technology and Services",
"location": "Armonk, New York, NY",
"Links": "https://www.linkedin.com/company/ibm?trk=similar-pages"
},
{
"title": "LinkedIn",
"subtitle": "Internet",
"location": "Sunnyvale, CA",
"Links": "https://www.linkedin.com/company/linkedin?trk=similar-pages"
},
{
"title": "Unilever",
"subtitle": "Consumer Goods",
"location": "Blackfriars, London",
"Links": "https://www.linkedin.com/company/unilever?trk=similar-pages"
},
{
"title": "Tesla",
"subtitle": "Automotive",
"location": "Palo Alto, CA",
"Links": "https://www.linkedin.com/company/tesla-motors?trk=similar-pages"
},
{
"title": "Nestlé",
"subtitle": "Food & Beverages",
"location": null,
"Links": "https://ch.linkedin.com/company/nestle-s-a-?trk=similar-pages"
}
],
"affiliated": [
{
"title": "YouTube",
"subtitle": "Internet",
"location": "San Bruno, CA",
"Links": "https://www.linkedin.com/company/youtube?trk=affiliated-pages"
},
{
"title": "Google Cloud",
"subtitle": "Internet",
"location": "Mountain View, California",
"Links": "https://www.linkedin.com/showcase/google-cloud?trk=affiliated-pages"
},
{
"title": "Think with Google",
"subtitle": "Marketing and Advertising",
"location": "Mountain View, California",
"Links": "https://www.linkedin.com/showcase/think-with-google?trk=affiliated-pages"
},
{
"title": "Google Ads",
"subtitle": "Marketing and Advertising",
"location": "Mountain View, California",
"Links": "https://www.linkedin.com/showcase/google-ads-?trk=affiliated-pages"
},
{
"title": "Google Workspace",
"subtitle": "Information Technology and Services",
"location": "Mountain View, California",
"Links": "https://www.linkedin.com/showcase/googleworkspace?trk=affiliated-pages"
},
{
"title": "Google Analytics products",
"subtitle": "Internet",
"location": "Mountain View, California",
"Links": "https://www.linkedin.com/showcase/google-analytics?trk=affiliated-pages"
},
{
"title": "Google Marketing Platform",
"subtitle": "Marketing and Advertising",
"location": "Mountain View, California",
"Links": "https://www.linkedin.com/showcase/googlemarketingplatform?trk=affiliated-pages"
},
{
"title": "Google Ad Manager",
"subtitle": "Marketing and Advertising",
"location": "Mountain View, California",
"Links": "https://www.linkedin.com/showcase/google-ad-manager?trk=affiliated-pages"
},
{
"title": "Grow with Google",
"subtitle": "E-Learning",
"location": "Mountain View, California",
"Links": "https://www.linkedin.com/showcase/grow-with-google?trk=affiliated-pages"
},
{
"title": "Google for Startups",
"subtitle": "Internet",
"location": "San Francisco, California",
"Links": "https://www.linkedin.com/showcase/google-for-startups?trk=affiliated-pages"
},
{
"title": "X, the moonshot factory",
"subtitle": "Research",
"location": "Mountain View, CA",
"Links": "https://www.linkedin.com/company/x?trk=affiliated-pages"
},
{
"title": "Google Small Business",
"subtitle": "Internet",
"location": "Mountain View, California",
"Links": "https://www.linkedin.com/showcase/google-small-business?trk=affiliated-pages"
},
{
"title": "Google Cloud Partners",
"subtitle": "Internet",
"location": "Mountain View, California",
"Links": "https://www.linkedin.com/showcase/google-cloud-partners?trk=affiliated-pages"
},
{
"title": "Google Partners",
"subtitle": "Marketing and Advertising",
"location": null,
"Links": "https://www.linkedin.com/showcase/google-partners?trk=affiliated-pages"
},
{
"title": "re:Work with Google",
"subtitle": "Human Resources",
"location": null,
"Links": "https://www.linkedin.com/showcase/rework-with-google?trk=affiliated-pages"
},
{
"title": "Google Play Apps & Games",
"subtitle": "Information Technology and Services",
"location": "Mountain View, CA",
"Links": "https://www.linkedin.com/showcase/googleplaydev?trk=affiliated-pages"
},
{
"title": "Google AdMob",
"subtitle": "Marketing and Advertising",
"location": null,
"Links": "https://www.linkedin.com/showcase/googleadmob?trk=affiliated-pages"
},
{
"title": "Google User Experience Research",
"subtitle": "Internet",
"location": null,
"Links": "https://www.linkedin.com/showcase/google-user-research.?trk=affiliated-pages"
},
{
"title": "Chrome Enterprise",
"subtitle": "Information Technology and Services",
"location": null,
"Links": "https://www.linkedin.com/showcase/chrome-enterprise?trk=affiliated-pages"
},
{
"title": "Google Pay",
"subtitle": "Internet",
"location": "Mountain View, California",
"Links": "https://www.linkedin.com/showcase/google-pay?trk=affiliated-pages"
},
{
"title": "Google News Initiative",
"subtitle": "Online Media",
"location": null,
"Links": "https://www.linkedin.com/showcase/google-news-initiative?trk=affiliated-pages"
},
{
"title": "CapitalG",
"subtitle": "Venture Capital & Private Equity",
"location": "San Francisco, CA",
"Links": "https://ca.linkedin.com/company/capitalg?trk=affiliated-pages"
},
{
"title": "Adometry (acquired by Google)",
"subtitle": "Marketing and Advertising",
"location": "Mountain View, CA",
"Links": "https://www.linkedin.com/company/adometry?trk=affiliated-pages"
},
{
"title": "Android Enterprise",
"subtitle": "Information Technology and Services",
"location": "Mountain View, CA",
"Links": "https://www.linkedin.com/showcase/androidenterprise?trk=affiliated-pages"
},
{
"title": "Android",
"subtitle": "Internet",
"location": null,
"Links": "https://www.linkedin.com/showcase/android_by_google?trk=affiliated-pages"
},
{
"title": "Google Nest Partners",
"subtitle": "Internet",
"location": "London, England",
"Links": "https://www.linkedin.com/showcase/google-nest-partners-uk?trk=affiliated-pages"
},
{
"title": "Rare with Google",
"subtitle": "Marketing and Advertising",
"location": null,
"Links": "https://www.linkedin.com/showcase/rare-with-google?trk=affiliated-pages"
},
{
"title": "Google Chrome",
"subtitle": "Internet",
"location": "Mountain View, CA",
"Links": "https://www.linkedin.com/showcase/google-chrome?trk=affiliated-pages"
}
],
"browse_jobs": [
{
"title": "Manager jobs",
"count": "1,493,779 open jobs",
"Links": "https://www.linkedin.com/jobs/manager-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Intern jobs",
"count": "29,918 open jobs",
"Links": "https://www.linkedin.com/jobs/intern-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Associate jobs",
"count": "957,712 open jobs",
"Links": "https://www.linkedin.com/jobs/associate-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Director jobs",
"count": "336 open jobs",
"Links": "https://www.linkedin.com/jobs/director-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Executive jobs",
"count": "520,180 open jobs",
"Links": "https://www.linkedin.com/jobs/executive-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Assistant jobs",
"count": "604,751 open jobs",
"Links": "https://www.linkedin.com/jobs/assistant-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Consultant jobs",
"count": "627,060 open jobs",
"Links": "https://www.linkedin.com/jobs/consultant-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Specialist jobs",
"count": "638,780 open jobs",
"Links": "https://www.linkedin.com/jobs/specialist-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Product Manager jobs",
"count": "43 open jobs",
"Links": "https://www.linkedin.com/jobs/product-manager-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Graduate jobs",
"count": "306,434 open jobs",
"Links": "https://www.linkedin.com/jobs/graduate-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Administrator jobs",
"count": "390,656 open jobs",
"Links": "https://www.linkedin.com/jobs/administrator-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Assistant Manager jobs",
"count": "294,753 open jobs",
"Links": "https://www.linkedin.com/jobs/assistant-manager-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Accountant jobs",
"count": "70,113 open jobs",
"Links": "https://www.linkedin.com/jobs/accountant-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Team Lead jobs",
"count": "874,470 open jobs",
"Links": "https://www.linkedin.com/jobs/team-lead-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Senior jobs",
"count": "813,491 open jobs",
"Links": "https://www.linkedin.com/jobs/senior-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Designer jobs",
"count": "56,232 open jobs",
"Links": "https://www.linkedin.com/jobs/designer-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Coordinator jobs",
"count": "439,136 open jobs",
"Links": "https://www.linkedin.com/jobs/coordinator-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Student jobs",
"count": "255,690 open jobs",
"Links": "https://www.linkedin.com/jobs/student-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Head jobs",
"count": "803,411 open jobs",
"Links": "https://www.linkedin.com/jobs/head-jobs?trk=organization_guest-browse_jobs"
},
{
"title": "Senior Manager jobs",
"count": "158,440 open jobs",
"Links": "https://www.linkedin.com/jobs/senior-manager-jobs?trk=organization_guest-browse_jobs"
}
],
"company_id": "17876832",
"timestamp": "2021-07-09T09:54:23.082Z",
"slogan": "",
"crunchbase_url": "https://www.crunchbase.com/organization/google",
"stock_info": {
"id": "GOOGL",
"datetime": "July 8, 2021",
"stock_exchange": "NASDAQ",
"delay": "20 minutes delay"
},
"funding": {
"rounds": "4 total rounds",
"last_round_type": "Undisclosed",
"last_round_date": "2000-06-01",
"last_round_raised": "US$ 10.0M"
},
"investors": [
"Yahoo"
],
"Website": "https://goo.gle/3m1IN7m",
"Industries": "Internet",
"Company size": "10,001+ employees",
"Headquarters": "Mountain View, CA",
"Type": "Public Company",
"Specialties": "search, ads, mobile, android, online video, apps, machine learning, virtual reality, cloud, hardware, artificial intelligence, youtube, and software"
}
]
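The data-scraper-response endpoint returns a list with one entry per scraped page. As a minimal sketch, assuming result holds the parsed list (finalResponse.json() in the Python polling examples above), you could pull out a few of the fields shown here, keeping in mind that field availability can vary from one company page to another:
def summarize_company(result):
    # result: the parsed list returned by data-scraper-response, as in the polling examples above
    for company in result:
        print(company["name"], "-", company["followers"], "followers")
        print("Headquarters:", company["Headquarters"])
        print("Specialties:", company["Specialties"])
        if company["updates"]:
            print("Latest update:", company["updates"][0]["text"][:100])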
Build a web crawler
In this article, we explain the benefits of a Web Crawler associated with a Scraping API, and the rules to build an efficient one.
Here is a crawler example using the ScrapingBot API with only two dependencies: request and cheerio.
You need at least Node.js 8 because the example uses async/await.
const request = require("request");
const util = require("util");
const rp = util.promisify(request);
const sleep = util.promisify(setTimeout);
const cheerio = require('cheerio');
const { URL } = require('url');
let seenLinks = {};
let rootNode = {};
let currentNode = {};
let linksQueue = [];
let printList = [];
let previousDepth = 0;
let maxCrawlingDepth = 5;
let options = null;
let mainDomain = null;
let mainParsedUrl = null;
class CreateLink {
constructor(linkURL, depth, parent) {
this.url = linkURL;
this.depth = depth;
this.parent = parent;
this.children = [];
}
}
//your scraping bot credentials
let username = "yourUsername",
apiKey = "yourApiKey",
apiEndPoint = "http://api.scraping-bot.io/scrape/raw-html",
auth = "Basic " + Buffer.from(username + ":" + apiKey).toString("base64");
let requestOptions = {
method: 'POST',
url: apiEndPoint,
json: {
url: "this will be replaced in the findLinks function",
//scraping-bot options
options: {
useChrome: false, //if you want to use headless Chrome, WARNING: two api calls will be consumed for this option
premiumProxy: false, //if you want to use premium proxies to unblock Amazon or LinkedIn (consuming 10 calls)
}
},
headers: {
Accept: 'application/json',
Authorization : auth
}
}
//Start the application: put here the address where you want to start crawling
//the second parameter is the depth: with 1 it will scrape all the links found on the first page but not the ones found on other pages
//with 2 it will scrape all links on the first page and all links found on second-level pages; be careful, on a huge website this can represent tons of pages to scrape
//it is recommended to limit the depth to 5 levels
crawlBFS("https://www.scraping-bot.io/", 1);
async function crawlBFS(startURL, maxDepth = 5) {
try {
mainParsedUrl = new URL(startURL);
} catch (e) {
console.log("URL is not valid", e);
return;
}
mainDomain = mainParsedUrl.hostname;
maxCrawlingDepth = maxDepth;
let startLinkObj = new CreateLink(startURL, 0, null);
rootNode = currentNode = startLinkObj;
addToLinkQueue(currentNode);
await findLinks(currentNode);
}
//
async function crawl(linkObj) {
//Add logs here if needed!
//console.log(`Checking URL: ${linkObj.url}`);
await findLinks(linkObj);
}
//The goal is to get the HTML and look for the links inside the page.
async function findLinks(linkObj) {
//let's set the url we want to scrape
requestOptions.json.url = linkObj.url
console.log("Scraping URL : " + linkObj.url);
let response
try {
response = await rp(requestOptions);
if (response.statusCode !== 200) {
if (response.statusCode === 401 || response.statusCode === 405) {
console.log("autentication failed check your credentials");
} else {
console.log("an error occurred check the URL" + response.statusCode, response.body);
}
return
}
//response.body is the whole content of the page if you want to store some kind of data from the web page you should do it here
let $ = cheerio.load(response.body);
let links = $('body').find('a').filter(function (i, el) {
return $(this).attr('href') != null;
}).map(function (i, x) {
return $(this).attr('href');
});
if (links.length > 0) {
links.map(function (i, x) {
let reqLink = checkDomain(x);
if (reqLink) {
if (reqLink != linkObj.url) {
let newLinkObj = new CreateLink(reqLink, linkObj.depth + 1, linkObj);
addToLinkQueue(newLinkObj);
}
}
});
} else {
console.log("No more links found for " + requestOptions.url);
}
let nextLinkObj = getNextInQueue();
if (nextLinkObj && nextLinkObj.depth <= maxCrawlingDepth) {
//random sleep
//It is very important to make this long enough to avoid spamming the website you want to scrape
//if you choose a short time you will potentially be blocked or take down the website you want to crawl
//time is in milliseconds here
let minimumWaitTime = 500; //half a second; these values are very low, on a real-world site you should use at least 30000 (30 seconds between each call)
let maximumWaitTime = 5000; //max five seconds
let waitTime = Math.round(minimumWaitTime + (Math.random() * (maximumWaitTime-minimumWaitTime)));
console.log("wait for " + waitTime + " milliseconds");
await sleep(waitTime);
//next url scraping
await crawl(nextLinkObj);
} else {
setRootNode();
printTree();
}
} catch (err) {
console.log("Something Went Wrong...", err);
}
}
//Go all the way up and set RootNode to the parent node
function setRootNode() {
while (currentNode.parent != null) {
currentNode = currentNode.parent;
}
rootNode = currentNode;
}
function printTree() {
addToPrintDFS(rootNode);
console.log(printList.join("\n|"));
}
function addToPrintDFS(node) {
let spaces = Array(node.depth * 3).join("-");
printList.push(spaces + node.url);
if (node.children) {
node.children.map(function (i, x) {
{
addToPrintDFS(i);
}
});
}
}
//Check if the domain belongs to the site being checked
function checkDomain(linkURL) {
let parsedUrl;
let fullUrl = true;
try {
parsedUrl = new URL(linkURL);
} catch (error) {
fullUrl = false;
}
if (fullUrl === false) {
if (linkURL.indexOf("/") === 0) {
//relative to domain url
return mainParsedUrl.protocol + "//" + mainParsedUrl.hostname + linkURL.split("#")[0];
} else if (linkURL.indexOf("#") === 0) {
//anchor avoid link
return
} else {
//relative url
let path = currentNode.url.match('.*\/')[0]
return path + linkURL;
}
}
let mainHostDomain = parsedUrl.hostname;
if (mainDomain == mainHostDomain) {
//console.log("returning Full Link: " + linkURL);
parsedUrl.hash = "";
return parsedUrl.href;
} else {
return;
}
}
function addToLinkQueue(linkobj) {
if (!linkInSeenListExists(linkobj)) {
if (linkobj.parent != null) {
linkobj.parent.children.push(linkobj);
}
linksQueue.push(linkobj);
addToSeen(linkobj);
}
}
function getNextInQueue() {
let nextLink = linksQueue.shift();
if (nextLink && nextLink.depth > previousDepth) {
previousDepth = nextLink.depth;
console.log(`------- CRAWLING ON DEPTH LEVEL ${previousDepth} --------`);
}
return nextLink;
}
function peekInQueue() {
return linksQueue[0];
}
//Adds links we've visited to the seenList
function addToSeen(linkObj) {
seenLinks[linkObj.url] = linkObj;
}
//Returns whether the link has been seen.
function linkInSeenListExists(linkObj) {
return seenLinks[linkObj.url] == null ? false : true;
}
Need to contact us ?
Please fill in this form and make your dreams come true !