Libraries & Integrations
Python
scrapy-scrapedo
You may prefer to use our Scrapy wrapper library for a faster and easier setup.
Install the library with the following command:
pip3 install scrapy-scrapedo
Then you can use the library as follows:
from scrapydo import scrapy, scrapedo

class ScrapedoSampleCrawler(scrapy.Spider):
    name = "Scrape-do Sample Crawler"

    def __init__(self):
        super().__init__(scrapedo.RequestParameters(
            token="TOKEN",  # Get your Scrape.do token from: dashboard.scrape.do
            params={
                "geoCode": "us",
                "super": False,
                "render": True,
                "playWithBrowser": [
                    {"Action": "Click", "Selector": "#manpage > div.mp > ul > li:nth-child(3) > a"},
                    {"Action": "Wait", "Timeout": 2000},
                    {"Action": "Execute", "Execute": "document.URL"}
                ],
            }))

    def start_requests(self):
        urls = [
            'https://httpbin.co/',
        ]
        for url in urls:
            yield self.Request(url=url, callback=self.parse)

    def parse(self, response):
        print(response.body)
        print("target:", self.target_url(response))
Since scrapy-scrapedo is only a wrapper around Scrapy, you can use all of Scrapy's features with it, as the sketch below shows.
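For example, a minimal sketch that combines the wrapper's RequestParameters interface shown above with Scrapy's standard custom_settings attribute; the throttling values here are hypothetical:

from scrapydo import scrapy, scrapedo

class ThrottledCrawler(scrapy.Spider):
    name = "throttled-crawler"
    # Plain Scrapy feature; these values are hypothetical examples
    custom_settings = {
        "CONCURRENT_REQUESTS": 2,
        "DOWNLOAD_DELAY": 1.0,
    }

    def __init__(self):
        super().__init__(scrapedo.RequestParameters(
            token="TOKEN",
            params={"render": False},
        ))

    def start_requests(self):
        yield self.Request(url="https://httpbin.co/", callback=self.parse)

    def parse(self, response):
        print(response.body)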
Scrapy
Scrape.do also supports raw Scrapy usage. The following example shows how to route Scrapy requests through the Scrape.do API.
import scrapy
import urllib.parse

class TestSpider(scrapy.Spider):
    name = "test"

    def update_url(self, url):
        return "https://api.scrape.do/?token=YOUR_TOKEN&url=" + urllib.parse.quote(url)

    def start_requests(self):
        urls = [
            'https://example.com',
        ]
        for url in urls:
            yield scrapy.Request(url=self.update_url(url), callback=self.parse)

    def parse(self, response):
        print(response.body)

Note that when you use the raw Scrapy library, you have to add a / after https://api.scrape.do and before ?. Otherwise, you may get 400 Bad Request errors. The final request URL should look like "https://api.scrape.do/?token=TOKEN&url=...".
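The same URL format works with any HTTP client, not just Scrapy. Here is a minimal sketch using the requests library (the token and target URL are placeholders):

import urllib.parse
import requests

target_url = "https://example.com"
# Note the / between the host and the query string
api_url = "https://api.scrape.do/?token=YOUR_TOKEN&url=" + urllib.parse.quote(target_url)
response = requests.get(api_url)
print(response.text)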
You can use most libraries through Proxy Mode.
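For example, a minimal sketch with the requests library: as in the browser examples below, your token is the proxy username and the request parameters go in the password field:

import requests

proxy = "http://YOUR_TOKEN:[email protected]:8080"
proxies = {"http": proxy, "https": proxy}
# verify=False mirrors the verify_ssl / ignore-certificate-errors settings below
response = requests.get("https://example.com", proxies=proxies, verify=False)
print(response.text)

The same proxy can be set per request in Scrapy through the proxy key in the request meta: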
import scrapy

class TestSpider(scrapy.Spider):
    name = "test"

    def start_requests(self):
        urls = [
            'https://example.com',
        ]
        meta = {"proxy": "http://YOUR_TOKEN:[email protected]:8080"}
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse, meta=meta)

Python Selenium Example
# pip install selenium-wire
from seleniumwire import webdriver

username = "YOUR_TOKEN"
password = "render=false"
formatted_proxy = f"http://{username}:{password}@proxy.scrape.do:8080"
options = {
    "proxy": {
        "http": formatted_proxy,
        "https": formatted_proxy,
        "verify_ssl": False,
    },
}
URL = "https://httpbin.co/anything?json"
chrome_options = webdriver.ChromeOptions()
# Block images and JavaScript to reduce concurrency and credit usage
chrome_prefs = {
    "profile.default_content_setting_values": {
        "images": 2,
        "javascript": 2,
    }
}
chrome_options.add_experimental_option("prefs", chrome_prefs)
driver = webdriver.Chrome(
    options=chrome_options,
    seleniumwire_options=options,
)
driver.get(URL)

TypeScript & JavaScript
Puppeteer
const puppeteer = require('puppeteer');

(async () => {
    const blockedResourceTypes = [
        'beacon',
        'csp_report',
        'font',
        'image',
        'imageset',
        'media',
        'object',
        'texttrack',
        'stylesheet',
    ];
    const username = 'YOUR_TOKEN';
    const password = 'render=false';
    const address = 'proxy.scrape.do:8080';
    const browser = await puppeteer.launch({
        args: [`--proxy-server=http://${address}`],
        acceptInsecureCerts: true,
        headless: false,
    });
    const page = await browser.newPage();
    // We suggest blocking resources to reduce concurrency and credit usage
    await page.setRequestInterception(true);
    page.on('request', request => {
        if (blockedResourceTypes.indexOf(request.resourceType()) !== -1) {
            console.log(`Blocked type:${request.resourceType()} url:${request.url()}`);
            request.abort();
        } else {
            request.continue();
        }
    });
    await page.authenticate({ username, password });
    await page.goto('https://www.example.com');
    await new Promise(resolve => setTimeout(resolve, 3000));
    await browser.close();
})();

Playwright (Chrome)
const { chromium } = require('playwright');

(async () => {
    const proxy = {
        server: 'http://proxy.scrape.do:8080',
        username: 'YOUR_TOKEN',
        password: '',
    };
    const browser = await chromium.launch({
        headless: false,
        proxy: {
            server: proxy.server,
            username: proxy.username,
            password: proxy.password,
        },
        args: ['--ignore-certificate-errors'],
    });
    const page = await browser.newPage();
    await page.goto('https://www.example.com/');
    await page.waitForTimeout(3000);
    await browser.close();
})();
