Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
76 changes: 69 additions & 7 deletions src/aws/lambda/CrawlingService.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { chromium } from 'playwright-core';
import { JobRegistry } from '../../entity/job/JobRegistry';
import { Job } from '../../entity/job/Job';
import { JobExecutor } from '../../entity/job/JobExecutor';
import { getKoreaTimeISO } from '../../utils/DateUtils';
import { getKoreaTimeISO, formatKoreaDateISO } from '../../utils/DateUtils';
import { validateJobName } from './LambdaEventValidator';
import { TargetDate } from '../../entity/TargetDate';
import { HandleErrors } from '../../utils/ErrorHandling';
Expand Down Expand Up @@ -46,18 +46,80 @@ export class CrawlingService {

const endTime = Date.now();
console.log(`Crawling completed in ${endTime - startTime}ms`);

const crawlingResult = this.transformResults(executionResult, job.jobName);

return this.createCrawlingResult(executionResult);
console.log(`스크래핑 결과, items: ${crawlingResult.results.length}`);

return crawlingResult;
} finally {
await this.cleanup();
}
}

private createCrawlingResult(executionResult: { processedJobs: string[]; results: any[]; itemCount: number }): CrawlingResult {
/**
 * Flattens the nested crawl output into a single array of rows so that a
 * Spring Batch JsonItemReader can read it.
 *
 * Input:  { '<institution>': { 'notice': [...], 'recruit': [...] } }
 * Output: [{ jobName, institutionName, category, crawledAt, ...item }, ...]
 *
 * Top-level values that are not objects (including null) and category values
 * that are not arrays are skipped. Item fields are spread last, so an item
 * property that shares a name with one of the metadata keys overrides it.
 *
 * NOTE(review): the declared parameter type (`any[] | null` per key) does not
 * describe the nested institution -> category -> items shape that the loop
 * walks — confirm what `Job.run` actually returns and tighten the type.
 *
 * @param result  Nested crawl output keyed by institution name. A falsy value
 *                produces an empty result.
 * @param jobName Name of the job that produced `result`; copied onto every
 *                row and reported as the only entry of `processedJobs`.
 * @returns The flattened rows together with their count.
 */
private transformResults(result: Record<string, any[] | null>, jobName: string): CrawlingResult {
  if (!result) {
    return this.createEmptyResult(jobName);
  }

  const flatResults: any[] = [];

  for (const [institutionName, categories] of Object.entries(result)) {
    if (typeof categories !== 'object' || categories === null) {
      continue;
    }

    for (const [category, items] of Object.entries(categories)) {
      if (!Array.isArray(items)) {
        continue;
      }

      items.forEach((item) => {
        // Date values on the item are replaced by Korea-time strings before the row is emitted.
        const transformedItem = this.convertDateFieldsToKoreaTime(item);

        flatResults.push({
          jobName,
          institutionName,
          category,
          crawledAt: getKoreaTimeISO(),
          ...transformedItem,
        });
      });
    }
  }

  // The item count is deliberately not logged here: the caller logs the same
  // "스크래핑 결과, items" line from the returned result, and logging it in both
  // places printed it twice per run.
  return {
    processedJobs: [jobName],
    results: flatResults,
    itemCount: flatResults.length,
  };
}

/**
 * Returns a shallow copy of `item` in which every top-level property holding
 * a `Date` instance is replaced by the result of `formatKoreaDateISO`.
 *
 * Only own top-level properties are inspected; nested objects are copied by
 * reference and left untouched. The input object itself is not modified.
 *
 * @param item Object to copy.
 * @returns The copy with its `Date` properties converted.
 */
private convertDateFieldsToKoreaTime(item: any): any {
  const copy = { ...item };

  // Walk the copy's own keys and swap each Date for its formatted string.
  Object.keys(copy).forEach((field) => {
    const current = copy[field];
    if (current instanceof Date) {
      copy[field] = formatKoreaDateISO(current);
    }
  });

  return copy;
}

/**
 * Builds the result returned when a job yields nothing to flatten.
 *
 * @param jobName Name of the job; reported as the only entry of `processedJobs`.
 * @returns A result with no rows and an item count of zero.
 */
private createEmptyResult(jobName: string): CrawlingResult {
  return {
    processedJobs: [jobName],
    results: [],
    itemCount: 0,
  };
}

Expand Down Expand Up @@ -109,7 +171,7 @@ export class CrawlingService {
}

/**
 * Runs a single job through the job executor and returns its raw output.
 *
 * The return type mirrors `JobExecutor.execute`, whose per-key values may be
 * `null`, so the result can be passed to `transformResults` without a type
 * mismatch.
 *
 * NOTE(review): relies on `this.jobExecutor` having been initialised before
 * this call (non-null assertion) — confirm against the setup code.
 *
 * @param job     Job to execute.
 * @param context Execution context carrying the target date.
 * @returns The job's output as returned by the executor.
 */
@HandleErrors(OPERATION_CONTEXT.JOB_EXECUTION, ERROR_MESSAGES.JOB_EXECUTION_FAILED)
private async executeJob(job: Job, context: { targetDate: Date }): Promise<Record<string, any[] | null>> {
  const result = await this.jobExecutor!.execute(job, context);
  return result;
}
Expand Down
8 changes: 4 additions & 4 deletions src/entity/Category.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
export enum Category {
NOTICE = 'NOTICE',
WELFARE = 'WELFARE',
RECRUIT = 'RECRUIT',
EVENT = 'EVENT',
NOTICE = 'NOTICE', // 공지사항
WELFARE = 'WELFARE', // 복지관소식
RECRUIT = 'RECRUIT', // 채용
EVENT = 'EVENT', // 행사/프로그램
}
48 changes: 4 additions & 44 deletions src/entity/job/JobExecutor.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@ import { Job } from './Job';
import { AppError } from '../../errors/AppError';
import { ERROR_MESSAGES } from '../../constants/ErrorMessages';
import { OPERATION_CONTEXT } from '../../constants/OperationContext';
// Removed infra dependencies '../../utils/ErrorHandling';

export interface ExecutionContext {
targetDate: Date;
Expand All @@ -15,12 +14,6 @@ export interface PageOptions {
timeout?: number;
}

export interface JobExecutionResult {
processedJobs: string[];
results: any[];
itemCount: number;
}

/**
* Job 실행을 담당하는 클래스
* 순수 도메인 클래스 - Job 실행과 결과 변환
Expand All @@ -34,23 +27,19 @@ export class JobExecutor {
this.browser = browser;
}

async execute(job: Job, context: ExecutionContext): Promise<JobExecutionResult> {
async execute(job: Job, context: ExecutionContext): Promise<Record<string, any[] | null>> {

console.log(`${job.jobName} Job 실행 시작`);

let page: Page | null = null;

try {
page = await this.createPage(context.pageOptions);
const result = await job.run(page, context.targetDate);
const flatResults = this.transformResults(result, job.jobName);

console.log(`${job.jobName} Job 실행 성공, items: ${flatResults.length}`);
console.log(`${job.jobName} Job 실행 성공`);

return {
processedJobs: [job.jobName],
results: flatResults,
itemCount: flatResults.length,
};
return result;
} catch (error) {
console.warn(`Job execution failed: ${job.jobName}`, error);
throw new AppError(
Expand Down Expand Up @@ -81,33 +70,4 @@ export class JobExecutor {

return page;
}

/**
 * Flattens the nested job output into a flat array that a Spring Batch
 * JsonItemReader can read.
 *
 * Input:  { '<institution>': { 'notice': [...], 'recruit': [...] } }
 * Output: [{ jobName, institutionName, category, crawledAt, ...item }, ...]
 *
 * Top-level values that are not objects (including null) and category values
 * that are not arrays are skipped. Item fields are spread last, so they win
 * over the metadata keys on a name clash.
 *
 * @param result  Nested job output keyed by institution name.
 * @param jobName Name of the job; copied onto every row.
 * @returns The flattened rows.
 */
private transformResults(result: Record<string, any[]>, jobName: string): any[] {
  const rows: any[] = [];

  Object.entries(result).forEach(([institutionName, categories]) => {
    if (categories === null || typeof categories !== 'object') {
      return;
    }

    Object.entries(categories).forEach(([category, items]) => {
      if (!Array.isArray(items)) {
        return;
      }

      items.forEach((item) => {
        rows.push({
          jobName,
          institutionName,
          category,
          // Stamped per row with the current time as a UTC ISO string.
          crawledAt: new Date().toISOString(),
          ...item,
        });
      });
    });
  });

  return rows;
}
}
10 changes: 8 additions & 2 deletions src/entity/job/JobRegistry.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { Job } from './Job';
import { 대한의료사회복지사협회 } from './implement/대한의료사회복지사협회';
import { 한국노인인력개발원 } from './implement/한국노인인력개발원';
import { 대한의료사회복지사협회 } from './implement/etc/대한의료사회복지사협회';
import { 한국노인인력개발원 } from './implement/etc/한국노인인력개발원';
import { 경기도사회복지사협회 } from './implement/gyeonggi/경기도사회복지사협회';
import { 오정노인복지기관 } from './implement/gyeonggi/bucheon/부천시노인복지기관포털/오정노인복지관';
import { 원미노인복지관 } from './implement/gyeonggi/bucheon/부천시노인복지기관포털/원미노인복지관';
Expand All @@ -16,6 +16,9 @@ import { 인천광역시장애인종합복지관 } from './implement/incheon/인
import { 인천광역시사회복지사협회 } from './implement/incheon/인천광역시사회복지사협회';
import { 미추홀장애인종합복지관 } from './implement/incheon/미추홀장애인종합복지관';
import { 서울시사회복지사협회 } from './implement/seoul/서울시사회복지사협회';
import { 거모종합사회복지관 } from './implement/gyeonggi/siheung/거모종합사회복지관';
import { 고강종합사회복지관 } from './implement/gyeonggi/bucheon/고강종합사회복지관';
import { 고산종합사회복지관 } from './implement/gyeonggi/uijeongbu/고산종합사회복지관';

/**
* 모든 크롤링 Job들을 등록하고 관리하는 Registry 클래스
Expand All @@ -39,6 +42,9 @@ export class JobRegistry {
new 인천광역시사회복지사협회(),
new 미추홀장애인종합복지관(),
new 서울시사회복지사협회(),
new 거모종합사회복지관(),
new 고강종합사회복지관(),
new 고산종합사회복지관(),
];

static getAllJobs(): Job[] {
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import { AbstractJob } from '../AbstractJob';
import { SimpleTemplateStep } from '../../step/SimpleTemplateStep';
import { AbstractJob } from '../../AbstractJob';
import { SimpleTemplateStep } from '../../../step/SimpleTemplateStep';
import { Locator } from 'playwright-core';
import { Category } from '../../Category';
import { parseDate } from '../../../utils/DateUtils';
import { Optimize, Optimizer } from '../../Optimize';
import { Category } from '../../../Category';
import { parseKoreaDate } from '../../../../utils/DateUtils';
import { Optimize, Optimizer } from '../../../Optimize';

export class 대한의료사회복지사협회 extends AbstractJob {
constructor() {
Expand Down Expand Up @@ -31,7 +31,7 @@ class 공지사항 extends SimpleTemplateStep {
const id = this.extractIdUsingStringMethods(link);
const title = (await a.textContent()).trim();
const dateStr = (await card.locator('.time').textContent()).trim();
const createdAt = parseDate(dateStr, '.');
const createdAt = parseKoreaDate(dateStr, '.');

return {
id: parseInt(id),
Expand Down Expand Up @@ -65,7 +65,7 @@ class 채용 extends SimpleTemplateStep {
const id = this.extractIdUsingStringMethods(link);
const title = (await a.textContent()).trim();
const dateStr = (await card.locator('.time').textContent()).trim();
const createdAt = parseDate(dateStr, '.');
const createdAt = parseKoreaDate(dateStr, '.');

return {
id: parseInt(id),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,9 +1,9 @@
import { AbstractJob } from '../AbstractJob';
import { SimpleTemplateStep } from '../../step/SimpleTemplateStep';
import { Category } from '../../Category';
import { AbstractJob } from '../../AbstractJob';
import { SimpleTemplateStep } from '../../../step/SimpleTemplateStep';
import { Category } from '../../../Category';
import { Locator } from 'playwright-core';
import { parseDate } from '../../../utils/DateUtils';
import { Optimize, Optimizer } from '../../Optimize';
import { parseKoreaDate } from '../../../../utils/DateUtils';
import { Optimize, Optimizer } from '../../../Optimize';

export class 한국노인인력개발원 extends AbstractJob {
constructor() {
Expand All @@ -28,7 +28,7 @@ class 공지사항 extends SimpleTemplateStep {
const title = (await a.textContent()).trim();

const dateStr = (await card.locator('td:nth-child(5)').textContent()).trim();
const createdAt = parseDate(dateStr, '.');
const createdAt = parseKoreaDate(dateStr, '.');

return {
id: parseInt(id),
Expand All @@ -51,7 +51,7 @@ class 채용 extends SimpleTemplateStep {
const title = (await a.textContent()).trim();

const dateStr = (await card.locator('td:nth-child(5)').textContent()).trim();
const createdAt = parseDate(dateStr, '.');
const createdAt = parseKoreaDate(dateStr, '.');

return {
id: parseInt(id),
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { AbstractStep } from '../../../../step/AbstractStep';
import { Page } from 'playwright-core';
import { parseDate } from '../../../../../utils/DateUtils';
import { parseKoreaDate } from '../../../../../utils/DateUtils';
import { Category } from '../../../../Category';

export class OnlyBucheonDefaultStep extends AbstractStep {
Expand Down Expand Up @@ -38,7 +38,7 @@ export class OnlyBucheonDefaultStep extends AbstractStep {

const id = (await card.locator('.cell').first().textContent()).trim();
const title = (await card.locator('.title .tit').textContent()).trim();
const createdAt: Date = parseDate((await card.locator('.date').textContent()).trim());
const createdAt: Date = parseKoreaDate((await card.locator('.date').textContent()).trim());
const link = this.parseOnclick(await card.locator('.tit_cont').getAttribute('onclick'));

list.push({
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import { AbstractStep } from '../../../../step/AbstractStep';
import { Category } from '../../../../Category';
import { Page } from 'playwright-core';
import { parseDate, isEqualOrAfterDateOnly } from '../../../../../utils/DateUtils';
import { parseKoreaDate, isEqualOrAfterDateOnly } from '../../../../../utils/DateUtils';

export class OnlyBucheonImageStep extends AbstractStep {
private readonly category: Category;
Expand Down Expand Up @@ -45,7 +45,7 @@ export class OnlyBucheonImageStep extends AbstractStep {
)
.trim()
.slice(0, 10);
const createdAt = parseDate(dateStr);
const createdAt = parseKoreaDate(dateStr);

if (!isEqualOrAfterDateOnly(syncDate, createdAt)) {
break;
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import { Locator } from "playwright-core";
import { Category } from "../../../../Category";
import { Optimize, Optimizer } from "../../../../Optimize";
import { MultiCategoryTemplateStep } from "../../../../step/MultiCategoryTemplateStep";
import { AbstractJob } from "../../../AbstractJob";
import { parseKoreaDate } from "../../../../../utils/DateUtils";
import { classifyCategory } from "../../../../../utils/CategoryClassifier";

/**
 * Crawling job for 고강종합사회복지관.
 *
 * Passes the job name, the site URL and a single `알림마당` step to
 * `AbstractJob`.
 */
export class 고강종합사회복지관 extends AbstractJob {
  constructor() {
    super(
      '고강종합사회복지관',
      'https://gogangwc.or.kr',
      [new 알림마당()],
    );
  }

  /**
   * Registers `Optimize.JS` on the supplied optimizer.
   *
   * @param optimizer Optimizer to register on.
   */
  registerOptimizer(optimizer: Optimizer) {
    optimizer.register(Optimize.JS);
  }
}

/**
 * Step that reads the 알림마당 listing and extracts title, date and link from
 * each `.index-item.article-item` card. The category of each entry is decided
 * from its title via `classifyCategory`; `Category.NOTICE` is handed to the
 * base step.
 */
class 알림마당 extends MultiCategoryTemplateStep {
  constructor() {
    super(
      'https://gogangwc.tistory.com/category/%EC%95%8C%EB%A6%BC%EB%A7%88%EB%8B%B9',
      '.index-item.article-item',
      Category.NOTICE
    );
  }

  /**
   * Extracts one entry from a listing card.
   *
   * NOTE(review): Playwright's `getAttribute` / `textContent` can resolve to
   * null; a missing heading or date would throw on `.trim()`, and a missing
   * href would end up as the text "null" in the link — confirm the cards
   * always carry these elements.
   * NOTE(review): the listing lives on gogangwc.tistory.com; verify that
   * `baseUrl` is the host the hrefs are relative to.
   *
   * @param card    Locator for a single listing card.
   * @param baseUrl Prefix prepended to the card's href.
   * @returns An object with `title`, `createdAt` and `link`.
   */
  async select(card: Locator, baseUrl: string): Promise<object> {
    const anchor = card.locator('.index-item-link');
    const href = await anchor.getAttribute('href');

    const headingText = await card.locator('h3').textContent();
    const title = headingText.trim();

    const digitText = await card.locator('.digit').textContent();
    const createdAt = parseKoreaDate(digitText.trim(), '. ');

    return { title, createdAt, link: baseUrl + href };
  }

  /**
   * Picks the category for an extracted entry from its title.
   *
   * @param data Entry produced by `select`; only `title` is read.
   * @returns Whatever `classifyCategory` returns for the title.
   */
  categorize(data: object): Category | null {
    return classifyCategory((data as { title: string }).title);
  }
}

Loading