使用 Crawlee 爬取新聞網站

Memo's Blog

2023-12-29

Node.js

JavaScript, Node.js, Programming, Web Scraping

前言

以爬取「自由時報電子報」為例，利用指定特定關鍵詞和時間區間的方式，可以把每天的新聞資料爬取下來。

建立專案

建立專案，選擇以 Puppeteer 作為驅動。

1
2
3

npx crawlee create crawlee-example

> PuppeteerCrawler template project [JavaScript]

安裝依賴套件。

1	npm i @apify/log commander dayjs dotenv

實作

建立 src/ltn/index.js 檔。

import log from '@apify/log';
import { PuppeteerCrawler } from 'crawlee';
import dayjs from 'dayjs';
import minMax from 'dayjs/plugin/minMax.js';
import { BASE_URL, DATE_FORMAT } from './constant/index.js';
import createRouter from './router/index.js';

dayjs.extend(minMax);

class LTNCrawler {
  constructor(options) {
    this.crawler = new PuppeteerCrawler({
      requestHandler: createRouter(), // 指定處理器
      ...options,
    });
  }

  async run({
    keyword,
    from,
    to,
  }) {
    const start = dayjs(from);
    const end = dayjs.min(dayjs(to), dayjs());
    const range = end.diff(start, 'day');

    // 以天為單位執行爬蟲
    for (let i = 0; i < range; i += 1) {
      const from = start.add(i, 'day').format(DATE_FORMAT);
      const to = start.add(i + 1, 'day').format(DATE_FORMAT);
      log.info(`Preparing the crawler for the keyword "${keyword}" from ${from} to ${to}`);
      await this.start({
        keyword,
        startDate: from,
        endDate: to,
      });
    }
  }

  start({
    keyword,
    startDate,
    endDate,
  }) {
    // 建立網址
    const params = {
      keyword,
      start_time: dayjs(startDate).format(DATE_FORMAT),
      end_time: dayjs(endDate).format(DATE_FORMAT),
      sort: 'date',
      type: 'all',
      page: '1',
    };

    const url = new URL(BASE_URL);
    Object.entries(params).forEach(([key, value]) => {
      url.searchParams.set(key, value);
    });

    return this.crawler.run([
      url.toString(),
    ]);
  }
}

export default LTNCrawler;

常數

建立 src/ltn/constant/index.js 檔。

export const BASE_URL = 'https://search.ltn.com.tw/list';
export const DATE_FORMAT = 'YYYYMMDD';

export const LABEL_LIST = 'list';
export const LABEL_ITEM = 'item';

路由

建立 src/ltn/router/index.js 檔。

import { createPuppeteerRouter } from 'crawlee';
import { LABEL_ITEM, LABEL_LIST } from '../constant/index.js';
import fetchItem from '../handlers/fetch-item.js';
import fetchList from '../handlers/fetch-list.js';

const createRouter = () => {
  const router = createPuppeteerRouter();

  // 預設處理器
  router.addDefaultHandler(fetchList);
  // 列表頁面處理器
  router.addHandler(LABEL_LIST, fetchList);
  // 詳細頁面處理器
  router.addHandler(LABEL_ITEM, fetchItem);

  return router;
};

export default createRouter;

處理器

建立 src/ltn/handlers/fetch-list.js 檔。

import { LABEL_ITEM, LABEL_LIST } from '../constant/index.js';

const fetchList = async ({ page, enqueueLinks, log }) => {
  // 找到新聞標題，將網址存入佇列，標記為詳細頁面
  const links = await enqueueLinks({
    label: LABEL_ITEM,
    strategy: 'same-domain',
    selector: '.tit',
  });
  log.debug('Enqueueing links', links);

  // 找到下一頁的按鈕
  const next = await page.$('.p_next');
  if (next) {
    // 找到下一頁的網址，將網址存入佇列，標記為列表頁面
    const links = await enqueueLinks({
      label: LABEL_LIST,
      strategy: 'same-domain',
      selector: '.p_next',
    });
    log.debug('Enqueueing links', links);
  }
};

export default fetchList;

建立 src/ltn/handlers/fetch-item.js 檔。

import { Dataset } from 'crawlee';
import dayjs from 'dayjs';
import { DATE_FORMAT } from '../constant/index.js';

const fetchItem = async ({ request, page, log }) => {
  // 新聞標題
  const title = await page.title();
  // 新聞日期
  const timestamp = await page.$eval('meta[property="article:published_time"]', ({ content }) => content);
  // 新聞內文
  const paragraphs = await page.$$eval('.content p:not([id]):not([class])', ($elements) => (
    $elements
      .map(({ innerText }) => innerText)
      .filter((v) => v)
  ));

  const data = {
    title,
    date: timestamp,
    url: request.loadedUrl,
    data: paragraphs,
  };

  const id = dayjs(timestamp).format(DATE_FORMAT);
  const dataset = await Dataset.open(id);
  dataset.pushData(data);

  log.info(title, {
    date: data.date,
    url: data.url,
  });
};

export default fetchItem;

啟動檔

建立 src/main.js 檔。

import { Command } from 'commander';
import 'dotenv/config';
import LTNCrawler from './ltn/index.js';

const program = new Command();

program
  .requiredOption('-k, --keyword <keyword>', 'Specify the search keyword')
  .requiredOption('-f, --from <from>', 'Specify the start date (YYYY-MM-DD)')
  .requiredOption('-t, --to <to>', 'Specify the end date (YYYY-MM-DD)')
  .parse(process.argv);

const { keyword, from, to } = program.opts();

// 初始化爬蟲
const ltnCrawler = new LTNCrawler({
  maxRequestsPerCrawl: 10000,
  maxConcurrency: 10,
});

// 啟動爬蟲
ltnCrawler.run({
  keyword,
  from,
  to,
});

使用

執行爬蟲。

1	npm start -- --keyword 賴清德 --from 2023-01-01 --to 2023-01-31

程式碼

news-crawler