본문 바로가기


프로젝트 하면서

LangChain 실습(2)

by worldforest 2025. 12. 26.
from langchain_openai import OpenAIEmbeddings

# Embedding client; reads OPENAI_API_KEY from the environment.
embedding = OpenAIEmbeddings()

# Smoke-test a single query embedding (the result was previously discarded).
trump_kr = embedding.embed_query('트럼프')

# Embed two names for the cosine-similarity comparison at the end of the file.
# BUG FIX: the original last line ended with an invisible zero-width space
# (U+200B), which is a SyntaxError outside a notebook; it has been removed.
Trump = embedding.embed_query('Donald Trump')
Elon = embedding.embed_query('Elon Reeve Musk')
from langchain_core.output_parsers import PydanticOutputParser

# Parser that coerces the LLM's JSON output into a SentimentArticle instance.
# NOTE(review): SentimentArticle is defined further down in this file; in a
# linear script this line must run AFTER that class definition (notebook
# cell-order artifact) — TODO reorder when converting to a module.
parser = PydanticOutputParser(pydantic_object=SentimentArticle)

# Prompt: classify a Korean news article's sentiment and emit JSON matching
# the parser's format instructions.
# BUG FIX: the original closing line carried a trailing zero-width space
# (U+200B) after the triple quotes — a SyntaxError in a plain .py file.
prompt_template = """
당신은 뉴스 기사의 감성을 분석하는 AI입니다.
아래 뉴스를 읽고 감성을 '긍정', '부정', '중립' 중 하나로 분류하고, 감성 점수를 json 형식으로 출력하세요.


{format_instructions}


뉴스 기사:
{news_article}
"""
from pydantic import BaseModel, RootModel, Field, ValidationError
from typing import List


class PersonInfo(BaseModel):
    """Schema for one person record."""

    name: str = Field(description='사람의 이름')
    age: int = Field(description="사람의 나이")


class PeopleList(RootModel[List[PersonInfo]]):
    """A bare JSON array of PersonInfo objects (pydantic v2 RootModel).

    The root type is already fixed by the ``RootModel[...]`` subscription,
    so the redundant ``root: List[PersonInfo]`` re-annotation was dropped.
    """


data = [
    {'name' : '서찬웅', 'age' : 20}
]

# Validate the raw list (raises ValidationError on malformed records);
# the result was previously discarded.
people = PeopleList.model_validate(data)

# BUG FIX: the original read `from typing import Literaㅣ` — the final
# character was the Korean jamo 'ㅣ' (U+3163), not the ASCII letter 'l',
# which raises ImportError at runtime.
from typing import Literal


class SentimentArticle(BaseModel):
    """Structured sentiment-analysis result for one news article."""

    # Sentiment label is restricted to exactly these three Korean values.
    sentiment: Literal['긍정', '부정', '중립'] = Field(description="감성분석 분류")
    score: float = Field(description='감성분석 점수')
    summary : str = Field(description = "글의 내용을 200자로 요약")
from dotenv import load_dotenv

# Load OPENAI_API_KEY (and friends) from a local .env file.
# BUG FIX: the original call line ended with an invisible zero-width space
# (U+200B), a SyntaxError outside a notebook; it has been removed.
load_dotenv()
from langchain_openai import ChatOpenAI

model = "gpt-5-mini-2025-08-07"

# `model_name` is a deprecated alias in langchain-openai; `model` is the
# documented parameter.
# NOTE(review): gpt-5 family reasoning models may only accept the default
# temperature; temperature=0 can be rejected by the API — confirm.
llm = ChatOpenAI(
    temperature=0,
    model=model,
)

 

from langchain_core.prompts import PromptTemplate

# Pre-fill the parser's JSON-format instructions so that only the article
# text remains as a runtime variable.
_format_instructions = parser.get_format_instructions()

prompt = PromptTemplate(
    template=prompt_template,
    input_variables=["news_article"],
    partial_variables={"format_instructions": _format_instructions},
)

# Runnable pipeline: fill prompt -> call model -> parse into SentimentArticle.
chain = prompt | llm | parser
from langchain_community.document_loaders import WebBaseLoader

# Fetch one Naver news article to feed the sentiment chain.
loader = WebBaseLoader("https://n.news.naver.com/mnews/hotissue/article/015/0005227494?type=series&cid=2003130")

# BUG FIX: the page was previously downloaded twice (`loader.load()` called
# on two consecutive lines); load once and reuse the documents.
docs = loader.load()
result = chain.invoke(docs[0].page_content.strip())

# Sentiment score of the parsed SentimentArticle result.
result.score

import json
import zipfile

path = "data/09.필수의료 의학지식 데이터/3.개방데이터/1.데이터/Training/02.라벨링데이터/TL_내과.zip"

# Unpack the labelled medical QA training data next to the notebook.
with zipfile.ZipFile(path, "r") as f:
    f.extractall("./data2")

# utf-8-sig strips the BOM these JSON files begin with.
with open("./data2/필수_11.json", 'r', encoding='utf-8-sig') as f:
    text = f.read()

# BUG FIX: the parsed object was previously discarded (`json.loads(text)` on
# its own line) while later code reads `text_json`; bind it here.
text_json = json.loads(text)
class Summary(BaseModel):
    """Structured summary of one medical QA record.

    Field descriptions are Korean runtime strings consumed by
    PydanticOutputParser's format instructions, so they are left as-is.
    """

    diagnosis : str = Field(description="해당 질병의 진단명")
    cause : str = Field(description="질병의 원인")
    complaint : str = Field(description="질병의 증상")
    treatment : str = Field(description="질병의 치료법")
# Parser that coerces the LLM output into a Summary instance.
parser = PydanticOutputParser(pydantic_object=Summary)

# FIX (shadowing): the original bound the raw template string and the
# PromptTemplate object to the same name `prompt`; the string now has its
# own name.
summary_template = """
    당신은 AI 어시스턴트입니다. 아래 지시사항대로 답변 하세요.

    QUESTION:
    {question}


    FORMAT
    {format_instructions}
    """

prompt = PromptTemplate(
    template=summary_template,
    input_variables=['question'],
    partial_variables={"format_instructions" : parser.get_format_instructions()}
)

# question/answer text -> LLM -> Summary instance.
chain = prompt | llm | parser

# NOTE(review): `text_json` must already hold one parsed record with
# 'question' and 'answer' keys (bound by the JSON-loading cell) — in the
# pasted linear order it is only defined later; verify the cell order.
chain.invoke(" 정답 --> ".join([text_json['question'], text_json['answer']]))

# BUG FIX: `os` was never imported anywhere in this file; os.walk would
# raise NameError.
import os

# Sanity check: list every extracted file under ./data2.
for roots, dirs, files in os.walk("./data2"):
    for file in files:
        print(f"{roots}/{file}")
import os

from tqdm import tqdm

# Concatenate every extracted QA record into one corpus string.
# PERF: collect pieces in a list and join once instead of quadratic `+=`.
# (Also adds the missing `import os` needed by os.walk.)
parts = []
for roots, dirs, files in os.walk("./data2"):
    for file in tqdm(files):
        with open(f"{roots}/{file}", 'r', encoding='utf-8-sig') as f:
            record = json.loads(f.read())
        # BUG FIX: the original joined [question, answer, "\n"] with
        # " 정답 --> ", which appended a stray " 정답 --> " before every
        # newline; the separator now sits only between question and answer.
        parts.append(f"{record['question']} 정답 --> {record['answer']}\n")
total_text = "".join(parts)
from langchain_text_splitters import RecursiveCharacterTextSplitter

# DEAD CODE removed: a chunk_overlap=0 splitter was created and immediately
# overwritten; only the overlap=50 configuration was ever used.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=200, chunk_overlap=50)

# Split the QA corpus into ~200-char chunks with 50-char overlap.
result = text_splitter.split_text(total_text)
import numpy as np
from numpy.linalg import norm


def cosine_sim(a, b):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        a, b: array-likes of equal length (e.g. embedding vectors).

    Returns:
        float in [-1, 1]; 1.0 means identical direction.
    """
    a = np.asarray(a, dtype=float)
    b = np.asarray(b, dtype=float)
    # BUG FIX: the original divided by `norm(b, axis=0) * norm(a)` and used
    # `b.T`; both are no-ops for 1-D inputs but silently wrong for 2-D
    # row-vector inputs. Use the plain vector formula.
    return float(np.dot(a, b) / (norm(a) * norm(b)))
cosine_sim(Trump, Elon)
반응형

댓글