generated from tailored/router-template
Initial commit
This commit is contained in:
308
src/crawl/nikkei/constants.ts
Normal file
308
src/crawl/nikkei/constants.ts
Normal file
@@ -0,0 +1,308 @@
|
||||
import { fileIsExist } from '@kevisual/use-config';
|
||||
import { useFileStore } from '@kevisual/use-config/file-store';
|
||||
import fs from 'fs';
|
||||
import path from 'path';
|
||||
|
||||
/** Site origin; the tab `href` values declared in this file are paths relative to it. */
export const baseUrl = 'https://cn.nikkei.com';

/** Task identifier; used below to derive the per-task config and data directories. */
export const taskName = 'nikkei';
|
||||
/**
 * Navigation tree of the site: each top-level entry is a section (`name`, `href`)
 * with the list of sub-tabs (`href`, `text`) shown beneath it.
 * All `href` values are relative to `baseUrl`.
 */
export const tabs = [
  {
    name: '日经精选',
    href: '/',
    children: [
      { href: '/top/nian-du-pan-dian-zhan-wang.html', text: '年度盘点展望' },
      { href: '/top/2021-04-20-01-47-39.html', text: '日本企业研究' },
      { href: '/top/2020-08-25-06-34-55.html?types[0]=8', text: '半导体/AI' },
      { href: '/top/2019-08-29-06-18-57.html', text: '中日深度观察' },
      { href: '/top/201604-3.html', text: '日本游' },
      { href: '/top/2021-03-03-07-02-53.html?types[0]=8', text: '脱碳经济' },
      { href: '/top/bp.html?types[0]=8', text: '日经BP精选' },
      { href: '/top/ft.html?types[0]=8', text: 'FT中文网精选' },
      { href: '/top/foa2024.html?types[0]=8', text: '亚洲的未来' },
    ],
  },
  {
    name: '中国',
    href: '/china.html',
    children: [
      { href: '/china/ceconomy.html', text: '经济' },
      { href: '/china/ccompany.html', text: '企业' },
      { href: '/china/cfinancial.html', text: '金融市场' },
      { href: '/china/cpolicssociety.html', text: '政治/社会' },
    ],
  },
  {
    name: '政经观察',
    href: '/politicsaeconomy.html',
    children: [
      { href: '/politicsaeconomy/epolitics.html', text: '宏观经济' },
      { href: '/politicsaeconomy/economic-policy.html', text: '经济政策' },
      { href: '/politicsaeconomy/stockforex.html', text: '股市/外汇' },
      { href: '/politicsaeconomy/investtrade.html', text: '投资/贸易' },
      { href: '/politicsaeconomy/efinance.html', text: '金融' },
      { href: '/politicsaeconomy/commodity.html', text: '大宗商品' },
      { href: '/politicsaeconomy/politicsasociety.html', text: '政治/社会' },
    ],
  },
  {
    name: '产业聚焦',
    href: '/industry.html',
    children: [
      { href: '/industry/icar.html', text: '汽车' },
      { href: '/industry/itelectric-appliance.html', text: 'IT/家电' },
      { href: '/industry/ienvironment.html', text: '环境/能源' },
      { href: '/industry/manufacturing.html', text: '工业' },
      { href: '/industry/agriculture.html', text: '农林水产' },
      { href: '/industry/propertiesconstruction.html', text: '地产/建设' },
      { href: '/industry/tradingretail.html', text: '商业/消费' },
      { href: '/industry/scienceatechnology.html', text: '科学/技术' },
      { href: '/industry/management-strategy.html', text: '经营/战略' },
    ],
  },
  {
    name: '新产品',
    href: '/product.html',
    children: [
      { href: '/product/pdigital.html', text: '数码与家电' },
      { href: '/product/automobile.html', text: '汽车' },
      { href: '/product/beautyahealth.html', text: '美容与健康' },
      { href: '/product/prime-goods.html', text: '美品精选' },
      { href: '/product/joke-goods.html', text: '非凡创意' },
    ],
  },
  {
    name: '穿梭日本',
    href: '/trend.html',
    children: [
      { href: '/trend/cool-japan.html', text: '酷日本' },
      { href: '/trend/beautyahealth.html', text: '美容健身' },
      { href: '/trend/traditional-culture.html', text: '文化精粹' },
      { href: '/trend/tourism.html', text: '日本逍遥行' },
    ],
  },
  {
    name: '专栏/观点',
    href: '/columnviewpoint.html',
    children: [
      { href: '/columnviewpoint/tearoom.html', text: '中日茶坊' },
      { href: '/columnviewpoint/columns-b.html', text: '肖敏捷论中日' },
      { href: '/columnviewpoint/column-special1.html', text: '日本人小声说' },
      { href: '/columnviewpoint/liudicolumn.html', text: '刘迪观察' },
      { href: '/columnviewpoint/zhangshicolumn.html', text: '张石的樱雪鸿泥' },
      { href: '/columnviewpoint/kelongcolumn.html', text: '老柯要说话' },
      { href: '/columnviewpoint/criticism.html', text: '社评' },
      { href: '/columnviewpoint/viewpoint.html', text: '观点' },
      { href: '/columnviewpoint/column.html', text: '专栏' },
    ],
  },
  {
    name: '职场/深造',
    href: '/career.html',
    children: [
      { href: '/career/humanresource.html', text: '人才活用' },
      { href: '/career/employment.html', text: '就业' },
      { href: '/career/abroadstudy.html', text: '留学/教育' },
    ],
  },
];
|
||||
|
||||
/**
 * Flat list of every sub-tab in `tabs`, each annotated with the name and href
 * of the section it belongs to (`parentName`, `parentHref`) followed by the
 * sub-tab's own fields.
 */
export const allTab = tabs.flatMap(({ name, href, children }) =>
  children.map((child) => ({
    parentName: name,
    parentHref: href,
    ...child,
  })),
);
|
||||
|
||||
/** A single flattened tab entry (same shape as the items of `allTab`), handy for trying the crawler on one tab. */
export const exampleTab = {
  parentName: '日经精选',
  parentHref: '/',
  href: '/top/bp.html?types[0]=8',
  text: '日经BP精选',
};
|
||||
|
||||
/** Directory holding this task's crawl configuration (relative to the working directory). */
export const crawlConfigPathRawDir = `./config/${taskName}`;

/** Directory where this task's crawled data is written (relative to the working directory). */
export const crawlDataPathDir = `./data/${taskName}`;

/** Path of the JSON file that stores the crawl progress. */
export const crawlConfigPathRaw = path.join(crawlConfigPathRawDir, 'crawl-config.json');

// NOTE(review): `needExists: true` presumably makes useFileStore create the
// directory when it is missing — confirm against @kevisual/use-config.
export const crawlDataPath = useFileStore(crawlDataPathDir, { needExists: true });

export const crawlConfigPath = useFileStore(crawlConfigPathRawDir, { needExists: true });
|
||||
|
||||
/**
 * Load the persisted crawl configuration from `crawlConfigPathRaw`.
 *
 * When the file is missing, unreadable, not valid JSON, or does not contain a
 * JSON object, the default configuration (`{ currentTabIndex: 0 }`) is written
 * back to disk and returned instead.
 *
 * @returns The parsed configuration object, or the default configuration.
 * @throws Whatever `setCrawlConfig` throws if the default cannot be written.
 */
export const getCrawlConfig = () => {
  const defaultConfig = { currentTabIndex: 0 };
  try {
    const parsed = JSON.parse(fs.readFileSync(crawlConfigPathRaw, 'utf-8'));
    // Callers read properties off the result, so anything that is not an
    // object (e.g. a file containing `null` or a bare number) is treated as corrupt.
    if (parsed !== null && typeof parsed === 'object') {
      return parsed;
    }
    console.warn('crawl config is not a JSON object, resetting:', crawlConfigPathRaw);
  } catch (error) {
    const { code, message } = error as NodeJS.ErrnoException;
    // A missing file is the expected first-run case; report everything else
    // so a corrupt or unreadable config does not get reset silently.
    if (code !== 'ENOENT') {
      console.warn('failed to read crawl config, resetting:', crawlConfigPathRaw, message);
    }
  }
  setCrawlConfig(defaultConfig);
  return defaultConfig;
};
|
||||
|
||||
/**
 * Persist the crawl configuration as pretty-printed JSON at `crawlConfigPathRaw`,
 * creating the configuration directory first when it does not exist yet.
 *
 * @param config Value to serialize; it is written as-is.
 */
export const setCrawlConfig = (config: any) => {
  const dirMissing = !fileIsExist(crawlConfigPathRawDir);
  if (dirMissing) {
    fs.mkdirSync(crawlConfigPathRawDir, { recursive: true });
  }
  const payload = JSON.stringify(config, null, 2);
  fs.writeFileSync(crawlConfigPathRaw, payload);
};
|
||||
223
src/crawl/nikkei/crawal-tab.ts
Normal file
223
src/crawl/nikkei/crawal-tab.ts
Normal file
@@ -0,0 +1,223 @@
|
||||
import * as cheerio from 'cheerio';
|
||||
import * as puppeteer from 'puppeteer';
|
||||
import fs from 'fs';
|
||||
|
||||
import { allTab, baseUrl, exampleTab, taskName } from './constants.ts';
|
||||
import { getCrawlConfig, setCrawlConfig } from './constants.ts';
|
||||
// Import the errors object from puppeteer
|
||||
|
||||
// Progress persisted by a previous run (read once at module load).
const crawlConfig = getCrawlConfig();

/** Resolve after `ms` milliseconds. */
export const sleep = (ms: number) =>
  new Promise((resolve) => {
    setTimeout(resolve, ms);
  });

// Consecutive timeout counter used by crawlTab, which gives up once it exceeds 3.
// It starts at 5, so a timeout that happens before the first successful crawl
// is not retried by crawlTab.
let errorCount = 5;

// Shared Puppeteer browser; launched lazily by initializeBrowser().
let browser: puppeteer.Browser;

// Index into allTab of the tab currently being crawled; resumes from the saved config.
let currentTabIndex = crawlConfig.currentTabIndex || 0;
|
||||
|
||||
/**
 * Launch the shared headless browser if one is not already assigned to
 * the module-level `browser` variable; otherwise do nothing.
 */
const initializeBrowser = async () => {
  if (browser) {
    return;
  }

  const launchArgs = [
    '--no-sandbox',
    '--disable-setuid-sandbox',
    // Disable the "automation controlled" Blink feature flag.
    '--disable-blink-features=AutomationControlled',
  ];
  const viewport = { width: 1280, height: 720 };

  browser = await puppeteer.launch({
    headless: true, // keep running in headless mode
    args: launchArgs,
    ignoreDefaultArgs: ['--enable-automation'],
    devtools: false,
    defaultViewport: viewport,
  });
};
|
||||
|
||||
/**
 * Close the shared browser, if any, and forget the reference.
 *
 * The reference is cleared before awaiting `close()` so that the
 * `if (!browser)` checks elsewhere in this module launch a fresh browser
 * instead of reusing a closed one, even when `close()` itself rejects.
 */
const closeBrowser = async () => {
  if (!browser) {
    return;
  }
  const closing = browser;
  browser = null;
  await closing.close();
};
|
||||
|
||||
/**
 * Open `link` in a new page of the shared browser and return its HTML.
 *
 * A navigation timeout closes the whole browser and retries with a fresh one,
 * up to `retryCount` more times (waiting 20 seconds in total between attempts).
 *
 * @param link Absolute URL to load.
 * @param retryCount Remaining retries after a navigation timeout.
 * @returns The page HTML, or `null` once the retries are exhausted.
 * @throws Any non-timeout error raised while preparing, loading or reading the page.
 */
const getPageContent = async (link: string, retryCount = 3) => {
  if (!browser) {
    await initializeBrowser();
  }
  const page = await browser.newPage();
  try {
    // Set a user agent and language header to look like a regular browser.
    await page.setUserAgent('Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36');
    await page.setExtraHTTPHeaders({
      'Accept-Language': 'en-US,en;q=0.9',
    });
    console.log('start crawl tab', currentTabIndex, link);

    let timedOut = false;
    try {
      await page.goto(link, { timeout: 120000 }); // navigation timeout raised to 120 seconds
    } catch (error) {
      if (!(error instanceof puppeteer.TimeoutError)) {
        console.log('Non-timeout error:', error.message);
        throw error; // handled by the caller
      }
      console.log('Timeout error when accessing:', link);
      timedOut = true;
    }

    if (!timedOut) {
      console.log('end crawl tab', currentTabIndex, link);
      return await page.content();
    }
  } finally {
    // Always release the page, whichever step failed. A failure to close is
    // only logged so that it cannot hide the error that got us here.
    await page.close().catch((closeError) => {
      console.log('Failed to close page:', closeError.message);
    });
  }

  // Only the navigation-timeout path reaches this point.
  if (retryCount <= 0) {
    console.log('Skipping after 3 retries:', link);
    return null;
  }
  console.log('Retrying...', retryCount, 'attempts left');
  await sleep(10000); // wait 10 seconds
  await browser.close();
  browser = null; // forces a new browser to be launched on the next attempt
  await sleep(10000); // wait 10 seconds
  return await getPageContent(link, retryCount - 1);
};
|
||||
|
||||
/**
 * Crawl one article page and, when `deep` is true, the other pages linked
 * from its pagination bar.
 *
 * @param link Absolute URL of the article page.
 * @param deep Whether to follow the pagination links (they are crawled with `deep = false`).
 * @returns One entry per crawled page. When pagination pages were followed,
 *          their entries come first and the entry for `link` is last.
 *          An empty array is returned when the page could not be crawled.
 */
export const crawlTab = async (link: string, deep = true): Promise<LinkTitleTimeCrawl[]> => {
  try {
    const html = await getPageContent(link);
    if (html == null) {
      // getPageContent gave up after its own timeout retries.
      console.error('crawlTab no content and return empty', 'currentTabIndex', currentTabIndex, link);
      return [];
    }
    const $ = cheerio.load(html);
    // Title: the element with classes "style01 mB10".
    const title = $('.style01.mB10').text();
    // Body: text of the ".newsText" elements inside "#contentDiv".
    const content = $('#contentDiv').find('.newsText').text();
    const current = { title, content, link, deep };

    // The page was fetched successfully, so the timeout streak is over.
    errorCount = 0;
    if (!deep) {
      return [current];
    }

    // Pagination links live in ".pagenavbar".
    const pageLinks = [
      ...new Set(
        $('.pagenavbar')
          .find('a')
          .toArray()
          .map((item) => $(item).attr('href'))
          // Anchors without an href would otherwise become "<baseUrl>undefined".
          .filter((href): href is string => Boolean(href))
          .map((href) => getFullLink(href))
          // Drop the "?start=0" link and the page we are already on.
          .filter((item) => !item.includes('?start=0') && item !== link),
      ),
    ];
    if (pageLinks.length === 0) {
      return [current];
    }

    console.log('aListFullLink', pageLinks);
    const pages = [];
    for (const item of pageLinks) {
      pages.push(...(await crawlTab(item, false)));
    }
    return [...pages, current];
  } catch (e) {
    if (!(e instanceof puppeteer.TimeoutError)) {
      console.error('crawlTab other error', 'currentTabIndex', currentTabIndex, 'link', link, 'error', e.message);
      return [];
    }

    // Timeout: restart the browser and retry a limited number of times.
    console.error('crawlTab error', 'currentTabIndex', currentTabIndex, 'link', link, 'error', e.message);
    await closeBrowser();
    // Drop the closed instance so the retry launches a new browser instead of
    // calling newPage() on a dead one.
    browser = null;
    console.error('close browser', 'errorCount', errorCount);
    errorCount++;
    if (errorCount > 3) {
      console.error('crawlTab error and return empty', 'currentTabIndex', currentTabIndex, link);
      return [];
    }
    await sleep(3000);
    return await crawlTab(link, deep);
  }
};
|
||||
/**
 * Build an absolute URL by prefixing `baseUrl` to a site-relative link.
 *
 * @param link Path as found in an `href`, e.g. "/china/ceconomy.html".
 */
export const getFullLink = (link: string) => baseUrl + link;
|
||||
/** One crawled article page, as produced by `crawlTab`. */
type LinkTitleTimeCrawl = {
  title: string;
  content: string;
  link: string;
  // Optional: crawlTab does not set this field on the entries it returns.
  page?: string;
  deep: boolean;
};

/** One article entry found on a tab's list page, plus its crawled pages. */
type LinkTitleTime = {
  /** Site-relative href taken from the list page. */
  link: string;
  /** Absolute URL built from `link`. */
  fullLink?: string;
  title: string;
  /** Publication time text from the list page, with the parentheses removed. */
  time: string;
  /** ISO timestamp of when the entry was collected. */
  crawlTime: string;
  crawl?: LinkTitleTimeCrawl[];
};
|
||||
/**
 * Crawl one tab's list page: collect the entries published within the last
 * two weeks, crawl each linked article (following pagination), and write the
 * result as JSON to `./data/<taskName>/<fileName>`.
 *
 * @param link Absolute URL of the tab's list page.
 * @param fileName Output file name; defaults to "a.json".
 * @returns The text of the list page's `h1` element(s).
 * @throws If the list page cannot be loaded or the output file cannot be written.
 */
export const crawlTabList = async (link: string, fileName?: string) => {
  await initializeBrowser();

  const html = await getPageContent(link);
  if (html == null) {
    // getPageContent returns null once its timeout retries are exhausted.
    throw new Error(`Failed to load tab list page: ${link}`);
  }
  const $ = cheerio.load(html);
  const title = $('h1').text();
  const linkTitleTimeList: LinkTitleTime[] = [];
  const nowTime = Date.now();
  const maxAgeMs = 14 * 24 * 60 * 60 * 1000; // two weeks

  $('.newsContent02')
    .find('dt')
    .each((index, element) => {
      const anchor = $(element).find('a');
      const href = anchor.attr('href');
      const itemTitle = anchor.text();
      const rawTime = $(element).find('span').text();
      console.log('Link:', href);
      console.log('Title:', itemTitle);
      console.log('Time:', rawTime);
      if (!href) {
        // Without an href there is nothing to crawl for this entry.
        console.log('skip item without link', itemTitle);
        return;
      }
      // The time text is wrapped in parentheses; strip them before parsing.
      const time = rawTime.replace(/\(/, '').replace(/\)/, '');
      const createTime = new Date(time).getTime();
      // Keep only entries from the last two weeks. An unparsable time yields
      // NaN, which fails this comparison, so such entries are kept.
      if (nowTime - createTime > maxAgeMs) {
        console.log('time is out of date', time);
        return;
      }
      linkTitleTimeList.push({
        link: href,
        title: itemTitle,
        fullLink: getFullLink(href),
        time,
        crawlTime: new Date().toISOString(),
      });
    });

  if (linkTitleTimeList.length) {
    console.log('current linkTitleTimeList', linkTitleTimeList.length);
  }
  // Crawl sequentially: all pages share one browser instance.
  for (const item of linkTitleTimeList) {
    item.crawl = await crawlTab(item.fullLink || getFullLink(item.link), true);
  }

  const dataDir = `./data/${taskName}`;
  // Make sure the output directory exists before writing.
  fs.mkdirSync(dataDir, { recursive: true });
  const filePath = `${dataDir}/${fileName || 'a.json'}`;
  fs.writeFileSync(filePath, JSON.stringify(linkTitleTimeList, null, 2));
  return title;
};
|
||||
|
||||
// URL of the example tab, for the manual single-tab run at the bottom of the file.
const tabExampleUrl = `${baseUrl}${exampleTab.href}`;

// Module load time; used to report the elapsed run time after each tab.
const startTime = new Date().getTime();

/**
 * Crawl every tab in `allTab`, starting at the persisted `currentTabIndex`.
 *
 * After each tab (whether it succeeded or failed) the next index is saved, so
 * an interrupted run resumes where it stopped. When the loop completes the
 * saved index is reset to 0. The browser is closed even if the loop throws.
 */
const crawlAllTab = async () => {
  const timerLabel = `${taskName} crawlAllTab`;
  await initializeBrowser();
  console.time(timerLabel);
  try {
    for (let i = currentTabIndex; i < allTab.length; i++) {
      const tab = allTab[i];
      const link = `${baseUrl}${tab.href}`;
      console.log('crawl tab', tab.parentName, tab.text, link);
      try {
        await crawlTabList(link, `${currentTabIndex}.json`);
      } catch (error) {
        // Best effort: a failing tab is logged and skipped so the rest still run.
        console.log('crawl tab error', link, error);
      }
      const diffTime = (new Date().getTime() - startTime) / 1000;
      console.log('crawl tab end', tab.parentName, tab.text, link, 'run-time:', diffTime, 's\n\n');
      currentTabIndex++;
      setCrawlConfig({
        currentTabIndex,
      });
    }
  } finally {
    // Do not leave a Chromium process behind if saving progress throws.
    await closeBrowser();
  }
  setCrawlConfig({
    currentTabIndex: 0,
  });
  console.timeEnd(timerLabel);
};

// Entry point. Report a failure instead of leaving an unhandled rejection.
crawlAllTab().catch((error) => {
  console.error('crawlAllTab failed', error);
  process.exitCode = 1;
});
// crawlTabList(tabExampleUrl, 'a.json');
|
||||
57
src/crawl/nikkei/find/find-list-cheerio.ts
Normal file
57
src/crawl/nikkei/find/find-list-cheerio.ts
Normal file
File diff suppressed because one or more lines are too long
60
src/crawl/nikkei/find/find-list.ts
Normal file
60
src/crawl/nikkei/find/find-list.ts
Normal file
File diff suppressed because one or more lines are too long
111
src/crawl/nikkei/find/tab.html
Normal file
111
src/crawl/nikkei/find/tab.html
Normal file
@@ -0,0 +1,111 @@
|
||||
console.log(name);
|
||||
<div class="banner">
|
||||
<ul class="bannerList fix" id="bannerList">
|
||||
<li class="bannerCho on">
|
||||
<h2><a href="/" target="_blank">日经精选</a></h2>
|
||||
<div>
|
||||
<ul class="fix" style="width:980">
|
||||
<li><a href="/top/nian-du-pan-dian-zhan-wang.html" target="_blank">年度盘点展望</a></li>
|
||||
<li><a href="/top/2021-04-20-01-47-39.html" target="_blank">日本企业研究</a></li>
|
||||
<li><a href="/top/2020-08-25-06-34-55.html?types[0]=8" target="_blank">半导体/AI</a></li>
|
||||
<li><a href="/top/2019-08-29-06-18-57.html" target="_blank">中日深度观察</a></li>
|
||||
<li><a href="/top/201604-3.html" target="_blank">日本游</a></li>
|
||||
<li><a href="/top/2021-03-03-07-02-53.html?types[0]=8" target="_blank">脱碳经济</a></li>
|
||||
<li><a href="/top/bp.html?types[0]=8" target="_blank">日经BP精选</a></li>
|
||||
<li><a href="/top/ft.html?types[0]=8" target="_blank">FT中文网精选</a></li>
|
||||
<li class="end"><a href="/top/foa2024.html?types[0]=8" target="_blank">亚洲的未来</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</li>
|
||||
<li class="bannerChi">
|
||||
<h2><a href="/china.html" target="_blank">中国</a></h2>
|
||||
<div>
|
||||
<ul class="fix" style="width:980">
|
||||
<li><a href="/china/ceconomy.html" target="_blank">经济</a></li>
|
||||
<li><a href="/china/ccompany.html" target="_blank">企业</a></li>
|
||||
<li><a href="/china/cfinancial.html" target="_blank">金融市场</a></li>
|
||||
<li class="end"><a href="/china/cpolicssociety.html" target="_blank">政治/社会</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</li>
|
||||
<li class="bannerObs">
|
||||
<h2><a href="/politicsaeconomy.html" target="_blank">政经观察</a></h2>
|
||||
<div>
|
||||
<ul class="fix" style="width:980">
|
||||
<li><a href="/politicsaeconomy/epolitics.html" target="_blank">宏观经济</a></li>
|
||||
<li><a href="/politicsaeconomy/economic-policy.html" target="_blank">经济政策</a></li>
|
||||
<li><a href="/politicsaeconomy/stockforex.html" target="_blank">股市/外汇</a></li>
|
||||
<li><a href="/politicsaeconomy/investtrade.html" target="_blank">投资/贸易</a></li>
|
||||
<li><a href="/politicsaeconomy/efinance.html" target="_blank">金融</a></li>
|
||||
<li><a href="/politicsaeconomy/commodity.html" target="_blank">大宗商品</a></li>
|
||||
<li class="end"><a href="/politicsaeconomy/politicsasociety.html" target="_blank">政治/社会</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</li>
|
||||
<li class="bannerInd">
|
||||
<h2><a href="/industry.html" target="_blank">产业聚焦</a></h2>
|
||||
<div>
|
||||
<ul class="fix" style="width:980">
|
||||
<li><a href="/industry/icar.html" target="_blank">汽车</a></li>
|
||||
<li><a href="/industry/itelectric-appliance.html" target="_blank">IT/家电</a></li>
|
||||
<li><a href="/industry/ienvironment.html" target="_blank">环境/能源</a></li>
|
||||
<li><a href="/industry/manufacturing.html" target="_blank">工业</a></li>
|
||||
<li><a href="/industry/agriculture.html" target="_blank">农林水产</a></li>
|
||||
<li><a href="/industry/propertiesconstruction.html" target="_blank">地产/建设</a></li>
|
||||
<li><a href="/industry/tradingretail.html" target="_blank">商业/消费</a></li>
|
||||
<li><a href="/industry/scienceatechnology.html" target="_blank">科学/技术</a></li>
|
||||
<li class="end"><a href="/industry/management-strategy.html" target="_blank">经营/战略</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</li>
|
||||
<li class="bannerNpr">
|
||||
<h2><a href="/product.html" target="_blank">新产品</a></h2>
|
||||
<div>
|
||||
<ul class="fix" style="width:980">
|
||||
<li><a href="/product/pdigital.html" target="_blank">数码与家电</a></li>
|
||||
<li><a href="/product/automobile.html" target="_blank">汽车</a></li>
|
||||
<li><a href="/product/beautyahealth.html" target="_blank">美容与健康</a></li>
|
||||
<li><a href="/product/prime-goods.html" target="_blank">美品精选</a></li>
|
||||
<li class="end"><a href="/product/joke-goods.html" target="_blank">非凡创意</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</li>
|
||||
<li class="bannerCjp">
|
||||
<h2><a href="/trend.html" target="_blank">穿梭日本</a></h2>
|
||||
<div>
|
||||
<ul class="fix" style="width:980">
|
||||
<li><a href="/trend/cool-japan.html" target="_blank">酷日本</a></li>
|
||||
<li><a href="/trend/beautyahealth.html" target="_blank">美容健身</a></li>
|
||||
<li><a href="/trend/traditional-culture.html" target="_blank">文化精粹</a></li>
|
||||
<li class="end"><a href="/trend/tourism.html" target="_blank">日本逍遥行</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</li>
|
||||
<li class="bannerPoi">
|
||||
<h2><a href="/columnviewpoint.html" target="_blank">专栏/观点</a></h2>
|
||||
<div>
|
||||
<ul class="fix" style="width:980">
|
||||
<li><a href="/columnviewpoint/tearoom.html" target="_blank">中日茶坊</a></li>
|
||||
<li><a href="/columnviewpoint/columns-b.html" target="_blank">肖敏捷论中日</a></li>
|
||||
<li><a href="/columnviewpoint/column-special1.html" target="_blank">日本人小声说</a></li>
|
||||
<li><a href="/columnviewpoint/liudicolumn.html" target="_blank">刘迪观察</a></li>
|
||||
<li><a href="/columnviewpoint/zhangshicolumn.html" target="_blank">张石的樱雪鸿泥</a></li>
|
||||
<li><a href="/columnviewpoint/kelongcolumn.html" target="_blank">老柯要说话</a></li>
|
||||
<li><a href="/columnviewpoint/criticism.html" target="_blank">社评</a></li>
|
||||
<li><a href="/columnviewpoint/viewpoint.html" target="_blank">观点</a></li>
|
||||
<li class="end"><a href="/columnviewpoint/column.html" target="_blank">专栏</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</li>
|
||||
<li class="bannerJob">
|
||||
<h2><a href="/career.html" target="_blank">职场/深造</a></h2>
|
||||
<div>
|
||||
<ul class="fix" style="width:980">
|
||||
<li><a href="/career/humanresource.html" target="_blank">人才活用</a></li>
|
||||
<li><a href="/career/employment.html" target="_blank">就业</a></li>
|
||||
<li class="end"><a href="/career/abroadstudy.html" target="_blank">留学/教育</a></li>
|
||||
</ul>
|
||||
</div>
|
||||
</li>
|
||||
</ul>
|
||||
</div>
|
||||
302
src/crawl/nikkei/find/tabs.json
Normal file
302
src/crawl/nikkei/find/tabs.json
Normal file
@@ -0,0 +1,302 @@
|
||||
[
|
||||
{
|
||||
"parentName": "日经精选",
|
||||
"parentHref": "/",
|
||||
"href": "/top/nian-du-pan-dian-zhan-wang.html",
|
||||
"text": "年度盘点展望"
|
||||
},
|
||||
{
|
||||
"parentName": "日经精选",
|
||||
"parentHref": "/",
|
||||
"href": "/top/2021-04-20-01-47-39.html",
|
||||
"text": "日本企业研究"
|
||||
},
|
||||
{
|
||||
"parentName": "日经精选",
|
||||
"parentHref": "/",
|
||||
"href": "/top/2020-08-25-06-34-55.html?types[0]=8",
|
||||
"text": "半导体/AI"
|
||||
},
|
||||
{
|
||||
"parentName": "日经精选",
|
||||
"parentHref": "/",
|
||||
"href": "/top/2019-08-29-06-18-57.html",
|
||||
"text": "中日深度观察"
|
||||
},
|
||||
{
|
||||
"parentName": "日经精选",
|
||||
"parentHref": "/",
|
||||
"href": "/top/201604-3.html",
|
||||
"text": "日本游"
|
||||
},
|
||||
{
|
||||
"parentName": "日经精选",
|
||||
"parentHref": "/",
|
||||
"href": "/top/2021-03-03-07-02-53.html?types[0]=8",
|
||||
"text": "脱碳经济"
|
||||
},
|
||||
{
|
||||
"parentName": "日经精选",
|
||||
"parentHref": "/",
|
||||
"href": "/top/bp.html?types[0]=8",
|
||||
"text": "日经BP精选"
|
||||
},
|
||||
{
|
||||
"parentName": "日经精选",
|
||||
"parentHref": "/",
|
||||
"href": "/top/ft.html?types[0]=8",
|
||||
"text": "FT中文网精选"
|
||||
},
|
||||
{
|
||||
"parentName": "日经精选",
|
||||
"parentHref": "/",
|
||||
"href": "/top/foa2024.html?types[0]=8",
|
||||
"text": "亚洲的未来"
|
||||
},
|
||||
{
|
||||
"parentName": "中国",
|
||||
"parentHref": "/china.html",
|
||||
"href": "/china/ceconomy.html",
|
||||
"text": "经济"
|
||||
},
|
||||
{
|
||||
"parentName": "中国",
|
||||
"parentHref": "/china.html",
|
||||
"href": "/china/ccompany.html",
|
||||
"text": "企业"
|
||||
},
|
||||
{
|
||||
"parentName": "中国",
|
||||
"parentHref": "/china.html",
|
||||
"href": "/china/cfinancial.html",
|
||||
"text": "金融市场"
|
||||
},
|
||||
{
|
||||
"parentName": "中国",
|
||||
"parentHref": "/china.html",
|
||||
"href": "/china/cpolicssociety.html",
|
||||
"text": "政治/社会"
|
||||
},
|
||||
{
|
||||
"parentName": "政经观察",
|
||||
"parentHref": "/politicsaeconomy.html",
|
||||
"href": "/politicsaeconomy/epolitics.html",
|
||||
"text": "宏观经济"
|
||||
},
|
||||
{
|
||||
"parentName": "政经观察",
|
||||
"parentHref": "/politicsaeconomy.html",
|
||||
"href": "/politicsaeconomy/economic-policy.html",
|
||||
"text": "经济政策"
|
||||
},
|
||||
{
|
||||
"parentName": "政经观察",
|
||||
"parentHref": "/politicsaeconomy.html",
|
||||
"href": "/politicsaeconomy/stockforex.html",
|
||||
"text": "股市/外汇"
|
||||
},
|
||||
{
|
||||
"parentName": "政经观察",
|
||||
"parentHref": "/politicsaeconomy.html",
|
||||
"href": "/politicsaeconomy/investtrade.html",
|
||||
"text": "投资/贸易"
|
||||
},
|
||||
{
|
||||
"parentName": "政经观察",
|
||||
"parentHref": "/politicsaeconomy.html",
|
||||
"href": "/politicsaeconomy/efinance.html",
|
||||
"text": "金融"
|
||||
},
|
||||
{
|
||||
"parentName": "政经观察",
|
||||
"parentHref": "/politicsaeconomy.html",
|
||||
"href": "/politicsaeconomy/commodity.html",
|
||||
"text": "大宗商品"
|
||||
},
|
||||
{
|
||||
"parentName": "政经观察",
|
||||
"parentHref": "/politicsaeconomy.html",
|
||||
"href": "/politicsaeconomy/politicsasociety.html",
|
||||
"text": "政治/社会"
|
||||
},
|
||||
{
|
||||
"parentName": "产业聚焦",
|
||||
"parentHref": "/industry.html",
|
||||
"href": "/industry/icar.html",
|
||||
"text": "汽车"
|
||||
},
|
||||
{
|
||||
"parentName": "产业聚焦",
|
||||
"parentHref": "/industry.html",
|
||||
"href": "/industry/itelectric-appliance.html",
|
||||
"text": "IT/家电"
|
||||
},
|
||||
{
|
||||
"parentName": "产业聚焦",
|
||||
"parentHref": "/industry.html",
|
||||
"href": "/industry/ienvironment.html",
|
||||
"text": "环境/能源"
|
||||
},
|
||||
{
|
||||
"parentName": "产业聚焦",
|
||||
"parentHref": "/industry.html",
|
||||
"href": "/industry/manufacturing.html",
|
||||
"text": "工业"
|
||||
},
|
||||
{
|
||||
"parentName": "产业聚焦",
|
||||
"parentHref": "/industry.html",
|
||||
"href": "/industry/agriculture.html",
|
||||
"text": "农林水产"
|
||||
},
|
||||
{
|
||||
"parentName": "产业聚焦",
|
||||
"parentHref": "/industry.html",
|
||||
"href": "/industry/propertiesconstruction.html",
|
||||
"text": "地产/建设"
|
||||
},
|
||||
{
|
||||
"parentName": "产业聚焦",
|
||||
"parentHref": "/industry.html",
|
||||
"href": "/industry/tradingretail.html",
|
||||
"text": "商业/消费"
|
||||
},
|
||||
{
|
||||
"parentName": "产业聚焦",
|
||||
"parentHref": "/industry.html",
|
||||
"href": "/industry/scienceatechnology.html",
|
||||
"text": "科学/技术"
|
||||
},
|
||||
{
|
||||
"parentName": "产业聚焦",
|
||||
"parentHref": "/industry.html",
|
||||
"href": "/industry/management-strategy.html",
|
||||
"text": "经营/战略"
|
||||
},
|
||||
{
|
||||
"parentName": "新产品",
|
||||
"parentHref": "/product.html",
|
||||
"href": "/product/pdigital.html",
|
||||
"text": "数码与家电"
|
||||
},
|
||||
{
|
||||
"parentName": "新产品",
|
||||
"parentHref": "/product.html",
|
||||
"href": "/product/automobile.html",
|
||||
"text": "汽车"
|
||||
},
|
||||
{
|
||||
"parentName": "新产品",
|
||||
"parentHref": "/product.html",
|
||||
"href": "/product/beautyahealth.html",
|
||||
"text": "美容与健康"
|
||||
},
|
||||
{
|
||||
"parentName": "新产品",
|
||||
"parentHref": "/product.html",
|
||||
"href": "/product/prime-goods.html",
|
||||
"text": "美品精选"
|
||||
},
|
||||
{
|
||||
"parentName": "新产品",
|
||||
"parentHref": "/product.html",
|
||||
"href": "/product/joke-goods.html",
|
||||
"text": "非凡创意"
|
||||
},
|
||||
{
|
||||
"parentName": "穿梭日本",
|
||||
"parentHref": "/trend.html",
|
||||
"href": "/trend/cool-japan.html",
|
||||
"text": "酷日本"
|
||||
},
|
||||
{
|
||||
"parentName": "穿梭日本",
|
||||
"parentHref": "/trend.html",
|
||||
"href": "/trend/beautyahealth.html",
|
||||
"text": "美容健身"
|
||||
},
|
||||
{
|
||||
"parentName": "穿梭日本",
|
||||
"parentHref": "/trend.html",
|
||||
"href": "/trend/traditional-culture.html",
|
||||
"text": "文化精粹"
|
||||
},
|
||||
{
|
||||
"parentName": "穿梭日本",
|
||||
"parentHref": "/trend.html",
|
||||
"href": "/trend/tourism.html",
|
||||
"text": "日本逍遥行"
|
||||
},
|
||||
{
|
||||
"parentName": "专栏/观点",
|
||||
"parentHref": "/columnviewpoint.html",
|
||||
"href": "/columnviewpoint/tearoom.html",
|
||||
"text": "中日茶坊"
|
||||
},
|
||||
{
|
||||
"parentName": "专栏/观点",
|
||||
"parentHref": "/columnviewpoint.html",
|
||||
"href": "/columnviewpoint/columns-b.html",
|
||||
"text": "肖敏捷论中日"
|
||||
},
|
||||
{
|
||||
"parentName": "专栏/观点",
|
||||
"parentHref": "/columnviewpoint.html",
|
||||
"href": "/columnviewpoint/column-special1.html",
|
||||
"text": "日本人小声说"
|
||||
},
|
||||
{
|
||||
"parentName": "专栏/观点",
|
||||
"parentHref": "/columnviewpoint.html",
|
||||
"href": "/columnviewpoint/liudicolumn.html",
|
||||
"text": "刘迪观察"
|
||||
},
|
||||
{
|
||||
"parentName": "专栏/观点",
|
||||
"parentHref": "/columnviewpoint.html",
|
||||
"href": "/columnviewpoint/zhangshicolumn.html",
|
||||
"text": "张石的樱雪鸿泥"
|
||||
},
|
||||
{
|
||||
"parentName": "专栏/观点",
|
||||
"parentHref": "/columnviewpoint.html",
|
||||
"href": "/columnviewpoint/kelongcolumn.html",
|
||||
"text": "老柯要说话"
|
||||
},
|
||||
{
|
||||
"parentName": "专栏/观点",
|
||||
"parentHref": "/columnviewpoint.html",
|
||||
"href": "/columnviewpoint/criticism.html",
|
||||
"text": "社评"
|
||||
},
|
||||
{
|
||||
"parentName": "专栏/观点",
|
||||
"parentHref": "/columnviewpoint.html",
|
||||
"href": "/columnviewpoint/viewpoint.html",
|
||||
"text": "观点"
|
||||
},
|
||||
{
|
||||
"parentName": "专栏/观点",
|
||||
"parentHref": "/columnviewpoint.html",
|
||||
"href": "/columnviewpoint/column.html",
|
||||
"text": "专栏"
|
||||
},
|
||||
{
|
||||
"parentName": "职场/深造",
|
||||
"parentHref": "/career.html",
|
||||
"href": "/career/humanresource.html",
|
||||
"text": "人才活用"
|
||||
},
|
||||
{
|
||||
"parentName": "职场/深造",
|
||||
"parentHref": "/career.html",
|
||||
"href": "/career/employment.html",
|
||||
"text": "就业"
|
||||
},
|
||||
{
|
||||
"parentName": "职场/深造",
|
||||
"parentHref": "/career.html",
|
||||
"href": "/career/abroadstudy.html",
|
||||
"text": "留学/教育"
|
||||
}
|
||||
]
|
||||
6
src/crawl/nikkei/main.ts
Normal file
6
src/crawl/nikkei/main.ts
Normal file
@@ -0,0 +1,6 @@
|
||||
// 抓取nikkei网站的新闻
|
||||
|
||||
// import { crawlUrl } from '../constants'
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user