NLPCC 出版部分相关源码记录

目录

Download

Unzip

Author

Title

Affiliation

Check number of tex

Zip

Rename

Delete


Download

import requests
from bs4 import BeautifulSoup

# Log in to the site and return the authenticated session.
def login(username, password):
    """POST the credentials to the login endpoint.

    Prints a status message and returns the ``requests`` session when the
    server answers with HTTP 200; otherwise prints a failure message and
    returns ``None``.
    """
    login_url = 'https://example.com/login'
    session = requests.session()
    # Any further form fields the site requires would be added to this dict.
    login_data = {'username': username, 'password': password}
    response = session.post(login_url, data=login_data)
    if response.status_code != 200:
        print("登录失败!")
        return None
    print("登录成功!")
    return session

# Collect the file links from the file-list page.
def get_file_links(session, file_list_url):
    """Fetch ``file_list_url`` and return the file links found on it.

    This is a template: the page is fetched and parsed, but the extraction
    step is left to be filled in, so an empty list is returned as written.
    """
    page = session.get(file_list_url)
    soup = BeautifulSoup(page.text, 'html.parser')
    # Site-specific extraction goes here, for example:
    #     file_links = soup.find_all('a', class_='file-link')
    file_links = []
    return file_links

# Download every linked file.
def download_files(session, file_links, download_path):
    """Save each link's target under ``download_path``.

    Every item of ``file_links`` must support ``link['href']`` (the URL) and
    ``link.text`` (used, stripped, as the local file name).  A message is
    printed per file for both success and failure.
    """
    for link in file_links:
        file_url, file_name = link['href'], link.text.strip()
        response = session.get(file_url, stream=True)
        if response.status_code != 200:
            print(f"{file_name} 下载失败!")
            continue
        # Stream the body to disk in 8 KiB chunks.
        target = f"{download_path}/{file_name}"
        with open(target, 'wb') as file:
            for chunk in response.iter_content(chunk_size=8192):
                file.write(chunk)
        print(f"{file_name} 下载成功!")

def main():
    """Run the template workflow: log in, list the files, download them."""
    username = 'your_username'
    password = 'your_password'
    file_list_url = 'https://example.com/files'  # URL of the file-list page
    download_path = 'downloaded_files'  # local download directory

    # Step 1: obtain an authenticated session.
    session = login(username, password)
    if not session:
        print("登录失败,请检查用户名和密码!")
        return

    # Step 2: collect the links offered on the file-list page.
    file_links = get_file_links(session, file_list_url)
    if not file_links:
        print("未找到文件链接!")
        return

    # Step 3: download everything that was found.
    download_files(session, file_links, download_path)
# if __name__ == "__main__":
#     main()
import requests
from bs4 import BeautifulSoup

def login(username, password, timeout=30):
    """Log in to the NLPCC 2023 softconf site and return the session.

    :param username: account name sent as the ``username`` form field
    :param password: password sent as the ``password`` form field
    :param timeout: seconds to wait for the server before ``requests``
        raises a timeout error; without it a stalled connection would block
        the script indefinitely
    :return: the ``requests`` session on HTTP 200, otherwise ``None``

    NOTE(review): only the HTTP status is checked.  Whether the site answers
    a rejected login with a non-200 status has not been verified - confirm
    before relying on the success message.
    """
    login_url = 'https://softconf.com/nlpcc/Main-2023/login/scmd.cgi?scmd=login'
    session = requests.session()
    login_data = {
        "username": username,
        "password": password,
    }
    response = session.post(login_url, data=login_data, timeout=timeout)
    if response.status_code == 200:
        print("登录成功!")
        return session
    print("登录失败!")
    return None
# Placeholder credentials; replace them with the real account before running.
username = "用户名"
passwd = "密码"
session = login(username, passwd)
import re

# Paper numbers (the digits in front of "X-" in the passcode) to be collected.
ids = {
    214, 215, 220, 221, 222, 225, 229, 233, 235, 238,
    239, 241, 246, 250, 251, 252, 254, 256, 258, 260,
    264, 271, 285, 292, 299, 301, 306, 307, 308,
}
file_list_url = "https://softconf.com/nlpcc/Main-2023/pub/scmd.cgi?scmd=manager&ToPage=monitorFinalSubmissions&FromPage=Main"
response = session.get(file_list_url)
soup = BeautifulSoup(response.text, 'html.parser')
# All anchors of the table with id "t1" on the monitoring page.
table = soup.find('table', id='t1')
links = table.find_all('a')
all_urls = [link.get('href') for link in links]

# Keep (paper number, absolute URL) for every submission link whose paper
# number is in `ids`.
urls = []
for href in all_urls:
    if not href or not href.startswith('scmd.cgi?scmd=submitPaperCustom'):
        continue
    m = re.search(r"passcode=(\d+)X-.+", href)
    if m is None:
        continue
    paper_number = int(m.group(1))
    if paper_number in ids:
        urls.append((paper_number, "https://softconf.com/nlpcc/Main-2023/pub/" + href))

# Sanity check: True when exactly one link was found per requested paper.
print(len(urls)==len(ids))
print(urls)
import time
import os
from tqdm.auto import tqdm

def download_files(session, urls: dict, paper_id: int):
    """Download the files of one paper into ``./downloads/<paper_id>/``.

    :param session: logged-in ``requests`` session used for the downloads
    :param urls: mapping of local file name -> URL to fetch
    :param paper_id: paper number; names the target sub-directory and is
        used in the failure message
    :return: ``None``; a message is printed for every URL that does not
        answer with HTTP 200

    The target directory is created once up front (also when ``urls`` is
    empty).
    """
    save_dir = os.path.join(".", "downloads", str(paper_id))
    os.makedirs(save_dir, exist_ok=True)
    for file_name, file_url in urls.items():
        # With stream=True the connection is only handed back to the pool
        # once the body is consumed or the response is closed; the context
        # manager guarantees that on the failure path too.
        with session.get(file_url, stream=True) as response:
            if response.status_code != 200:
                print(f"{paper_id}_{file_name} 下载失败!")
                continue
            # Write the body to disk in 8 KiB chunks.
            with open(os.path.join(save_dir, file_name), 'wb') as file:
                for chunk in response.iter_content(chunk_size=8192):
                    file.write(chunk)

def _find_field_url(links, field_id):
    """Return the first href in ``links`` ending in ``fieldid=<field_id>``, or None."""
    suffix = f"fieldid={field_id}"
    for link in links:
        href = link.get('href')
        if href and href.endswith(suffix):
            return href
    return None


# Visit every paper's submission page and download its attachments.
for paper_id, url in tqdm(urls):
    response = session.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    links = soup.find_all('a')
    pdf_url = _find_field_url(links, "Final_Manuscript")
    zip_url = _find_field_url(links, "Source_File")
    copyright_url = _find_field_url(links, "CopyRight_Springer")

    # NOTE(review): only the copyright form is downloaded in this pass.  To
    # fetch everything, use
    #   {"Final_Manuscript.pdf": pdf_url, "Source_File.zip": zip_url,
    #    "CopyRight.pdf": copyright_url}
    downloads_urls = {"CopyRight.pdf": copyright_url}

    # A page without the expected link is reported and skipped instead of
    # aborting the whole run.
    for file_name, href in downloads_urls.items():
        if href is None:
            print(f"{paper_id}_{file_name} 未找到下载链接!")
    downloads_urls = {name: href for name, href in downloads_urls.items() if href is not None}

    try:
        download_files(session, downloads_urls, paper_id)
    except Exception as e:
        # Best effort: keep going with the next paper, but say what failed.
        # Catching Exception (not everything) lets Ctrl-C stop the loop.
        print(f"{paper_id} 下载失败: {e!r}")
    # Pause between papers to go easy on the server.
    time.sleep(2)

Unzip

import zipfile
import os
import pathlib

def unzip_file(zip_filepath, dest_path):
    """Extract every member of the archive ``zip_filepath`` into ``dest_path``."""
    archive = zipfile.ZipFile(zip_filepath, 'r')
    try:
        archive.extractall(dest_path)
    finally:
        archive.close()

# Usage: unpack each paper's Source_File.zip into a sibling Source_File folder.
root_dir = pathlib.Path("./downloads/")
for directory in root_dir.iterdir():
    archive_path = directory / "Source_File.zip"
    target_dir = directory / "Source_File"
    try:
        unzip_file(archive_path, target_dir)
    except Exception as e:
        # Best effort: report the error and the affected paper, then go on.
        print(e, directory, sep="\n")
import pathlib

root_dir = pathlib.Path("./downloads/")
for directory in root_dir.iterdir():
    path = directory/"Source_File"
    path_true = pathlib.Path(path)
    dir_outputs_tex_true = path_true/"outputs_tex"
    dir_outputs_tex_true.mkdir(exist_ok=True)

    if (path/"submission.tex").exists():
        dir_outputs_tex = pathlib.PurePosixPath("outputs_tex")
        path_tex = pathlib.PurePosixPath("submission.tex")
        path_aux = dir_outputs_tex/"submission.aux"
        ! cd {path_true} & pdflatex -output-directory={dir_outputs_tex} -synctex=0 -interaction=nonstopmode -file-line-error {path_tex}
        ! cd {path_true} & bibtex {path_aux}
        ! cd {path_true} & pdflatex -output-directory={dir_outputs_tex} -synctex=0 -interaction=nonstopmode -file-line-error {path_tex}
        ! cd {path_true} & pdflatex -output-directory={dir_outputs_tex} -synctex=0 -interaction=nonstopmode -file-line-error {path_tex}
    else:
        print(directory)
def compile2pdf(directory):
    directory = pathlib.Path(directory)
    path = directory/"Source_File"
    path_true = pathlib.Path(path)
    dir_outputs_tex_true = path_true/"outputs_tex"
    dir_outputs_tex_true.mkdir(exist_ok=True)

    if (path/"submission.tex").exists():
        dir_outputs_tex = pathlib.PurePosixPath("outputs_tex")
        path_tex = pathlib.PurePosixPath("submission.tex")
        path_aux = dir_outputs_tex/"submission.aux"
        ! cd {path_true} & pdflatex -output-directory={dir_outputs_tex} -synctex=0 -interaction=nonstopmode -file-line-error {path_tex}
        ! cd {path_true} & bibtex {path_aux}
        ! cd {path_true} & pdflatex -output-directory={dir_outputs_tex} -synctex=0 -interaction=nonstopmode -file-line-error {path_tex}
        ! cd {path_true} & pdflatex -output-directory={dir_outputs_tex} -synctex=0 -interaction=nonstopmode -file-line-error {path_tex}
    else:
        print(directory)

# Build a single paper directory on its own (here: paper 306).
compile2pdf("downloads/306")
def is_same_file(file1, file2):
    """Return True when the two files have byte-for-byte identical content."""
    with open(file1, 'rb') as first, open(file2, 'rb') as second:
        first_bytes = first.read()
        second_bytes = second.read()
    return first_bytes == second_bytes

import PyPDF2

from PyPDF2 import PdfReader

def extract_text_from_pdf(file_path):
    """Return ``(text, page_count)`` for the PDF at ``file_path``.

    ``text`` is the extracted text of all pages concatenated in page order,
    with nothing inserted between pages.
    """
    with open(file_path, 'rb') as file:
        pdf = PdfReader(file)
        page_texts = [page.extract_text() for page in pdf.pages]
    return "".join(page_texts), len(pdf.pages)

def compare_pdfs(file_path1, file_path2):
    """Compare two PDFs by their extracted text.

    Returns ``(same_text, pages_1, pages_2)``: whether the extracted texts
    are equal, followed by the page count of each file.
    """
    first_text, first_pages = extract_text_from_pdf(file_path1)
    second_text, second_pages = extract_text_from_pdf(file_path2)
    same_text = first_text == second_text
    return same_text, first_pages, second_pages

# Check each submitted camera-ready PDF against the PDF built from its sources.
root_dir = pathlib.Path("./downloads/")
for directory in root_dir.iterdir():
    camera_ready = directory / "Final_Manuscript.pdf"
    compiled = directory / "Source_File" / "outputs_tex" / "submission.pdf"
    try:
        ok, n1, n2 = compare_pdfs(camera_ready, compiled)
    except Exception as e:
        # Missing or unreadable PDFs are reported; the loop continues.
        print(e)
        print(f"Fail to compare: {directory}")
    else:
        if not ok:
            print(f"Not same: {directory}")
        # Page counts: submitted PDF, then compiled PDF.
        print(n1, n2, sep='    ')

    print("=========================================================================")

Author

import re

def extract_author(tex_file_path):
    r"""Return the body of the ``\author{...}`` command of a LaTeX file.

    The command must start at the beginning of a line, and its closing brace
    must be followed by whitespace and then a ``%``.  Surrounding whitespace
    is stripped from the body.  Returns ``""`` when nothing matches.
    """
    with open(tex_file_path, 'r', encoding='utf-8') as tex_file:
        tex_content = tex_file.read()

    # Lazy body: stops at the first "}" that is followed by whitespace and "%".
    author_re = re.compile(r"^\\author{\s*(.*?)\s*}\s+\%", re.DOTALL|re.MULTILINE)
    found = author_re.search(tex_content)
    return found.group(1) if found else ""


# Try the extractor on one paper first.  The path is built with pathlib so it
# does not depend on Windows backslash separators.
tex_file_path = pathlib.Path("downloads")/"215"/"Source_File"/"submission.tex"
author = extract_author(tex_file_path)

# Collect the raw \author body (LaTeX markup included) of every paper; papers
# without a submission.tex contribute an empty string.
authors = []
root_dir = pathlib.Path("./downloads/")
# iterdir() yields entries in arbitrary order; sorting keeps the row order
# stable from run to run.
for directory in sorted(root_dir.iterdir()):
    tex_file_path = directory/"Source_File"/"submission.tex"
    print(f"------{directory}---------")
    if tex_file_path.exists():
        author = extract_author(tex_file_path)
        authors.append(author)
        print(author)
    else:
        print(f"Fail to open tex: {tex_file_path}")
        authors.append("")
    print('====================================================================')
import pandas as pd

# Destination workbook for the author column.
file_path = "./author.xlsx"

# Wrap the collected list in a one-column DataFrame and write it without the
# index column.
df = pd.DataFrame(data=authors, columns=["author"])
df.to_excel(excel_writer=file_path, index=False)

Title

import re

def extract_title(tex_file_path):
    r"""Return the body of the ``\title{...}`` command of a LaTeX file.

    The command must start at the beginning of a line, and its closing brace
    must be followed by whitespace and then a ``%``.  Surrounding whitespace
    is stripped from the body.  Returns ``""`` when nothing matches.
    """
    with open(tex_file_path, 'r', encoding='utf-8') as tex_file:
        tex_content = tex_file.read()

    # Lazy body: stops at the first "}" that is followed by whitespace and "%".
    title_re = re.compile(r"^\\title{\s*(.*?)\s*}\s+\%", re.DOTALL|re.MULTILINE)
    found = title_re.search(tex_content)
    if found is None:
        return ""
    return found.group(1)


# tex_file_path = "downloads\\215\\Source_File\\submission.tex"  # Replace with the path to your .tex file
# author = extract_author(tex_file_path)

# Collect the cleaned-up title of every paper.
# NOTE: the list is still called `authors` because the export code that
# follows reads that name; it holds titles here.
authors = []
root_dir = pathlib.Path("./downloads/")
# iterdir() yields entries in arbitrary order; sorting keeps the row order
# stable from run to run.
for directory in sorted(root_dir.iterdir()):
    tex_file_path = directory/"Source_File"/"submission.tex"
    print(f"------{directory}---------")
    if tex_file_path.exists():
        title = extract_title(tex_file_path)
        # Replace LaTeX line breaks ("\\") and the whitespace around them
        # with a single space.
        title = re.sub(r"\s*\\\\\s*", " ", title)
        # Drop everything from the first remaining backslash to the end of
        # its line (for example a trailing \thanks{...}).
        title = re.sub(r"\\.*", "", title)
        authors.append(title)
        print(title)
    else:
        print(f"Fail to open tex: {tex_file_path}")
        authors.append("")
    print('====================================================================')
import pandas as pd

# Destination workbook for the title column.
file_path = "./title.xlsx"

# Wrap the collected list in a one-column DataFrame and write it without the
# index column.
df = pd.DataFrame(data=authors, columns=["title"])
df.to_excel(excel_writer=file_path, index=False)

Affiliation

import re

def extract_affiliation(tex_file_path):
    r"""Return the body of the ``\institute{...}`` command of a LaTeX file.

    The command must start at the beginning of a line, and its closing brace
    must be followed by whitespace and then a ``%``.  Surrounding whitespace
    is stripped from the body.  Returns ``""`` when nothing matches.
    """
    with open(tex_file_path, 'r', encoding='utf-8') as tex_file:
        tex_content = tex_file.read()

    # Lazy body: stops at the first "}" that is followed by whitespace and "%".
    institute_re = re.compile(r"^\\institute{\s*(.*?)\s*}\s+\%", re.DOTALL|re.MULTILINE)
    found = institute_re.search(tex_content)
    if found is None:
        return ""
    return found.group(1)


# tex_file_path = "downloads\\215\\Source_File\\submission.tex"  # Replace with the path to your .tex file
# author = extract_author(tex_file_path)

# Collect the raw \institute body of every paper.
# NOTE: the list is still called `authors` because the export code that
# follows reads that name; it holds affiliations here.
authors = []
root_dir = pathlib.Path("./downloads/")
# iterdir() yields entries in arbitrary order; sorting keeps the row order
# stable from run to run.  The counter starts at 2 - presumably the
# spreadsheet row of the entry, row 1 being the header; confirm.
for i, directory in enumerate(sorted(root_dir.iterdir()), start=2):
    tex_file_path = directory/"Source_File"/"submission.tex"
    print(f"------{i} {directory}---------")
    if tex_file_path.exists():
        affiliation = extract_affiliation(tex_file_path)
        authors.append(affiliation)
        print(affiliation)
    else:
        print(f"Fail to open tex: {tex_file_path}")
        authors.append("")
    print('====================================================================')
import pandas as pd

# Destination workbook for the affiliation column.
file_path = "./affiliation.xlsx"

# Wrap the collected list in a one-column DataFrame and write it without the
# index column.
df = pd.DataFrame(data=authors, columns=["affiliation"])
df.to_excel(excel_writer=file_path, index=False)

Check number of tex

import pathlib
root_dir = pathlib.Path("./downloads/")

def num_tex(dirctory: pathlib.Path):
    """Count the direct children of ``dirctory`` whose suffix is ``.tex``.

    Only the top level is inspected; sub-directories are not searched.
    """
    return sum(1 for entry in dirctory.iterdir() if entry.suffix == '.tex')

# Print every paper directory whose Source_File folder holds more than one
# top-level .tex file.
for d in root_dir.iterdir():
    tex_count = num_tex(d / "Source_File")
    if tex_count <= 1:
        continue
    print(d)

Zip

import os
import zipfile

def zip_directory(directory_path, zip_path):
    """
    Compress a directory tree into a zip file.
    :param directory_path: path of the directory to compress
    :param zip_path: path where the zip file is written
    """
    with zipfile.ZipFile(zip_path, mode='w', compression=zipfile.ZIP_DEFLATED) as archive:
        for current_dir, _subdirs, file_names in os.walk(directory_path):
            for file_name in file_names:
                source = os.path.join(current_dir, file_name)
                # Store each file under its path relative to the zipped root.
                member_name = os.path.relpath(source, start=directory_path)
                archive.write(source, member_name)

# # 示例用法
# directory_to_compress = '/path/to/source_directory'
# zip_file_path = '/path/to/destination.zip'
# zip_directory(directory_to_compress, zip_file_path)

Rename

import pathlib
root_dir = pathlib.Path("./downloads/")

# Package each paper: zip the source folder as source.zip and give the two
# PDFs their final names.
for d in list(root_dir.iterdir()):
    src = d/"Source_File"
    zip_directory(src, src.parent/"source.zip")
    submi = d/"Final_Manuscript.pdf"
    submi.rename(submi.with_name("submission.pdf"))
    cprt = d/"CopyRight.pdf"
    # Rename exactly once.  Nesting a second rename around this call would
    # try to move CopyRight.pdf again after it is already gone.
    cprt.rename(cprt.with_name("copyright.pdf"))

Delete

import pathlib
import shutil
import os
root_dir = pathlib.Path("./downloads/")

# Remove the originally uploaded source archive from every paper directory.
for d in list(root_dir.iterdir()):
    src = d / "Source_File.zip"
    src.unlink()

你可能感兴趣的:(python)